From 9807a54cd74149988f5d20088bf7a7957c205bfb Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Mon, 7 Jan 2013 09:54:31 -0800 Subject: linux/openvswitch.h: Make OVSP_LOCAL 32-bit. OVS ports are now 32-bit, so OVSP_LOCAL should be too. (Internally, kernel module still keeps port numbers 16-bit, though.) Signed-off-by: Jarno Rajahalme Signed-off-by: Jesse Gross --- include/linux/openvswitch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index d42e174bd0c8..99e6414a40d9 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -94,7 +94,7 @@ struct ovs_vport_stats { }; /* Fixed logical ports. */ -#define OVSP_LOCAL ((__u16)0) +#define OVSP_LOCAL ((__u32)0) /* Packet transfer. */ -- cgit From 345046673449b5c35840e5cc34a60059cbec9305 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:00:53 +0000 Subject: slab: Move kmalloc related function defs Move these functions higher up in slab.h so that they are grouped with other generic kmalloc related definitions. Acked-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 5d168d7e0a28..ccbb37685c6c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -147,6 +147,15 @@ void kmem_cache_free(struct kmem_cache *, void *); sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) +/* + * Common kmalloc functions provided by all allocators + */ +void * __must_check __krealloc(const void *, size_t, gfp_t); +void * __must_check krealloc(const void *, size_t, gfp_t); +void kfree(const void *); +void kzfree(const void *); +size_t ksize(const void *); + /* * The largest kmalloc size supported by the slab allocators is * 32 megabyte (2^25) or the maximum allocatable page order if that is @@ -224,15 +233,6 @@ struct seq_file; int cache_show(struct kmem_cache *s, struct seq_file *m); void print_slabinfo_header(struct seq_file *m); -/* - * Common kmalloc functions provided by all allocators - */ -void * __must_check __krealloc(const void *, size_t, gfp_t); -void * __must_check krealloc(const void *, size_t, gfp_t); -void kfree(const void *); -void kzfree(const void *); -size_t ksize(const void *); - /* * Allocator specific definitions. These are mainly used to establish optimized * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by -- cgit From ce6a50263d4ddeba1f0d08f16716a82770c03690 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:14:19 +0000 Subject: slab: Common kmalloc slab index determination Extract the function to determine the index of the slab within the array of kmalloc caches as well as a function to determine maximum object size from the nr of the kmalloc slab. This is used here only to simplify slub bootstrap but will be used later also for SLAB. 
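For illustration, the mapping this commit extracts behaves like the following minimal userspace sketch. This is not the kernel code itself: it assumes KMALLOC_MIN_SIZE == 8 (so the smallest cache sits at index 3, i.e. 2^3 bytes), uses a loop where the kernel unrolls the comparisons by hand, and omits the DMA and per-allocator details; the real helpers are kmalloc_index() and kmalloc_size() in include/linux/slab.h.

#include <stdio.h>

/*
 * Standalone sketch of the size -> kmalloc-cache-index mapping.
 * Indexes 1 and 2 are the odd-sized 96- and 192-byte caches; all
 * other caches hold power-of-two sizes, with index n == log2(size).
 */
static int sketch_kmalloc_index(size_t size)
{
	int n = 3;

	if (!size)
		return 0;
	if (size <= 8)
		return 3;
	if (size > 64 && size <= 96)
		return 1;
	if (size > 128 && size <= 192)
		return 2;
	while (((size_t)1 << n) < size)	/* smallest power of two >= size */
		n++;
	return n;
}

/* Inverse: object size served by the nth cache (0 if that slot is unused). */
static size_t sketch_kmalloc_size(int n)
{
	if (n > 2)
		return (size_t)1 << n;
	if (n == 1)
		return 96;
	if (n == 2)
		return 192;
	return 0;
}

int main(void)
{
	size_t sizes[] = { 8, 33, 96, 100, 192, 193, 4096 };

	for (int i = 0; i < 7; i++) {
		int idx = sketch_kmalloc_index(sizes[i]);
		printf("kmalloc(%4zu) -> index %2d -> %4zu-byte cache\n",
		       sizes[i], idx, sketch_kmalloc_size(idx));
	}
	return 0;
}

Note how sizes 65..96 and 129..192 short-circuit into the two odd-sized caches before the power-of-two scan; everything else rounds up to the next power of two.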
Acked-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab.h | 172 +++++++++++++++++++++++++++++++++-------------- include/linux/slub_def.h | 63 ----------------- 2 files changed, 122 insertions(+), 113 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index ccbb37685c6c..c97fe92532d1 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -94,29 +94,6 @@ #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ (unsigned long)ZERO_SIZE_PTR) -/* - * Common fields provided in kmem_cache by all slab allocators - * This struct is either used directly by the allocator (SLOB) - * or the allocator must include definitions for all fields - * provided in kmem_cache_common in their definition of kmem_cache. - * - * Once we can do anonymous structs (C11 standard) we could put a - * anonymous struct definition in these allocators so that the - * separate allocations in the kmem_cache structure of SLAB and - * SLUB is no longer needed. - */ -#ifdef CONFIG_SLOB -struct kmem_cache { - unsigned int object_size;/* The original size of the object */ - unsigned int size; /* The aligned/padded/added on size */ - unsigned int align; /* Alignment as calculated */ - unsigned long flags; /* Active flags on the slab */ - const char *name; /* Slab name for sysfs */ - int refcount; /* Use counter */ - void (*ctor)(void *); /* Called on object slot creation */ - struct list_head list; /* List of all slab caches on the system */ -}; -#endif struct mem_cgroup; /* @@ -156,6 +133,35 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +#ifdef CONFIG_SLOB +/* + * Common fields provided in kmem_cache by all slab allocators + * This struct is either used directly by the allocator (SLOB) + * or the allocator must include definitions for all fields + * provided in kmem_cache_common in their definition of kmem_cache. + * + * Once we can do anonymous structs (C11 standard) we could put a + * anonymous struct definition in these allocators so that the + * separate allocations in the kmem_cache structure of SLAB and + * SLUB is no longer needed. + */ +struct kmem_cache { + unsigned int object_size;/* The original size of the object */ + unsigned int size; /* The aligned/padded/added on size */ + unsigned int align; /* Alignment as calculated */ + unsigned long flags; /* Active flags on the slab */ + const char *name; /* Slab name for sysfs */ + int refcount; /* Use counter */ + void (*ctor)(void *); /* Called on object slot creation */ + struct list_head list; /* List of all slab caches on the system */ +}; + +#define KMALLOC_MAX_SIZE (1UL << 30) + +#include + +#else /* CONFIG_SLOB */ + /* * The largest kmalloc size supported by the slab allocators is * 32 megabyte (2^25) or the maximum allocatable page order if that is @@ -171,6 +177,99 @@ size_t ksize(const void *); #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_HIGH) #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_HIGH - PAGE_SHIFT) +/* + * Kmalloc subsystem. + */ +#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 +#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN +#else +#ifdef CONFIG_SLAB +#define KMALLOC_MIN_SIZE 32 +#else +#define KMALLOC_MIN_SIZE 8 +#endif +#endif + +#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) + +/* + * Figure out which kmalloc slab an allocation of a certain size + * belongs to. + * 0 = zero alloc + * 1 = 65 .. 96 bytes + * 2 = 120 .. 192 bytes + * n = 2^(n-1) .. 
2^n -1 + */ +static __always_inline int kmalloc_index(size_t size) +{ + if (!size) + return 0; + + if (size <= KMALLOC_MIN_SIZE) + return KMALLOC_SHIFT_LOW; + + if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) + return 1; + if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) + return 2; + if (size <= 8) return 3; + if (size <= 16) return 4; + if (size <= 32) return 5; + if (size <= 64) return 6; + if (size <= 128) return 7; + if (size <= 256) return 8; + if (size <= 512) return 9; + if (size <= 1024) return 10; + if (size <= 2 * 1024) return 11; + if (size <= 4 * 1024) return 12; + if (size <= 8 * 1024) return 13; + if (size <= 16 * 1024) return 14; + if (size <= 32 * 1024) return 15; + if (size <= 64 * 1024) return 16; + if (size <= 128 * 1024) return 17; + if (size <= 256 * 1024) return 18; + if (size <= 512 * 1024) return 19; + if (size <= 1024 * 1024) return 20; + if (size <= 2 * 1024 * 1024) return 21; + if (size <= 4 * 1024 * 1024) return 22; + if (size <= 8 * 1024 * 1024) return 23; + if (size <= 16 * 1024 * 1024) return 24; + if (size <= 32 * 1024 * 1024) return 25; + if (size <= 64 * 1024 * 1024) return 26; + BUG(); + + /* Will never be reached. Needed because the compiler may complain */ + return -1; +} + +#ifdef CONFIG_SLAB +#include +#elif defined(CONFIG_SLUB) +#include +#else +#error "Unknown slab allocator" +#endif + +/* + * Determine size used for the nth kmalloc cache. + * return size or 0 if a kmalloc cache for that + * size does not exist + */ +static __always_inline int kmalloc_size(int n) +{ + if (n > 2) + return 1 << n; + + if (n == 1 && KMALLOC_MIN_SIZE <= 32) + return 96; + + if (n == 2 && KMALLOC_MIN_SIZE <= 64) + return 192; + + return 0; +} +#endif /* !CONFIG_SLOB */ + /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. @@ -233,33 +332,6 @@ struct seq_file; int cache_show(struct kmem_cache *s, struct seq_file *m); void print_slabinfo_header(struct seq_file *m); -/* - * Allocator specific definitions. These are mainly used to establish optimized - * ways to convert kmalloc() calls to kmem_cache_alloc() invocations by - * selecting the appropriate general cache at compile time. - * - * Allocators must define at least: - * - * kmem_cache_alloc() - * __kmalloc() - * kmalloc() - * - * Those wishing to support NUMA must also define: - * - * kmem_cache_alloc_node() - * kmalloc_node() - * - * See each allocator definition file for additional comments and - * implementation notes. - */ -#ifdef CONFIG_SLUB -#include -#elif defined(CONFIG_SLOB) -#include -#else -#include -#endif - /** * kmalloc_array - allocate memory for an array. * @n: number of elements. diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 9db4825cd393..99c3e05ff1f0 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -115,17 +115,6 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -/* - * Kmalloc subsystem. - */ -#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 -#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN -#else -#define KMALLOC_MIN_SIZE 8 -#endif - -#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) - /* * Maximum kmalloc object size handled by SLUB. Larger object allocations * are passed through to the page allocator. 
The page allocator "fastpath" @@ -152,58 +141,6 @@ struct kmem_cache { */ extern struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; -/* - * Sorry that the following has to be that ugly but some versions of GCC - * have trouble with constant propagation and loops. - */ -static __always_inline int kmalloc_index(size_t size) -{ - if (!size) - return 0; - - if (size <= KMALLOC_MIN_SIZE) - return KMALLOC_SHIFT_LOW; - - if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) - return 1; - if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) - return 2; - if (size <= 8) return 3; - if (size <= 16) return 4; - if (size <= 32) return 5; - if (size <= 64) return 6; - if (size <= 128) return 7; - if (size <= 256) return 8; - if (size <= 512) return 9; - if (size <= 1024) return 10; - if (size <= 2 * 1024) return 11; - if (size <= 4 * 1024) return 12; -/* - * The following is only needed to support architectures with a larger page - * size than 4k. We need to support 2 * PAGE_SIZE here. So for a 64k page - * size we would have to go up to 128k. - */ - if (size <= 8 * 1024) return 13; - if (size <= 16 * 1024) return 14; - if (size <= 32 * 1024) return 15; - if (size <= 64 * 1024) return 16; - if (size <= 128 * 1024) return 17; - if (size <= 256 * 1024) return 18; - if (size <= 512 * 1024) return 19; - if (size <= 1024 * 1024) return 20; - if (size <= 2 * 1024 * 1024) return 21; - BUG(); - return -1; /* Will never be reached */ - -/* - * What we really wanted to do and cannot do because of compiler issues is: - * int i; - * for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) - * if (size <= (1 << i)) - * return i; - */ -} - /* * Find the slab cache for a given combination of allocation flags and size. * -- cgit From e33660165c901d18e7d3df2290db070d3e4b46df Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:14:18 +0000 Subject: slab: Use common kmalloc_index/kmalloc_size functions Make slab use the common functions. We can get rid of a lot of old ugly stuff as a result. Among them the sizes array and the weird include/linux/kmalloc_sizes file and some pretty bad #include statements in slab_def.h. The one thing that is different in slab is that the 32 byte cache will also be created for arches that have page sizes larger than 4K. There are numerous smaller allocations that SLOB and SLUB can handle better because of their support for smaller allocation sizes so let's keep the 32 byte slab also for arches with > 4K pages.
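For readers unfamiliar with what is being retired here: kmalloc_sizes.h was an X-macro style header, included repeatedly with a different CACHE() definition at each point of use. A compressed single-file sketch of that idiom follows (the size list is made up, and a macro list stands in for the repeated #include; the real header selected entries with preprocessor conditionals on PAGE_SIZE, L1_CACHE_BYTES and KMALLOC_MAX_SIZE):

#include <stdio.h>

/* One authoritative list of general-cache sizes... */
#define CACHE_SIZES \
	CACHE(32)   \
	CACHE(64)   \
	CACHE(128)  \
	CACHE(192)

struct cache_sizes {
	size_t cs_size;
};

/* ...expanded once to build the table of sizes... */
#define CACHE(x) { .cs_size = (x) },
static struct cache_sizes malloc_sizes[] = { CACHE_SIZES };
#undef CACHE

/* ...and expanded again to count the entries at compile time. */
#define CACHE(x) + 1
enum { NR_GENERAL_CACHES = 0 CACHE_SIZES };
#undef CACHE

int main(void)
{
	for (int i = 0; i < NR_GENERAL_CACHES; i++)
		printf("general cache %d: %zu bytes\n",
		       i, malloc_sizes[i].cs_size);
	return 0;
}

The commit replaces this table-plus-linear-scan scheme with direct indexing by kmalloc_index(), which the compiler can constant-fold.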
Reviewed-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/kmalloc_sizes.h | 45 ----------- include/linux/slab_def.h | 47 +++--------- mm/slab.c | 169 +++++++++++++++++++----------------------- 3 files changed, 88 insertions(+), 173 deletions(-) delete mode 100644 include/linux/kmalloc_sizes.h (limited to 'include/linux') diff --git a/include/linux/kmalloc_sizes.h b/include/linux/kmalloc_sizes.h deleted file mode 100644 index e576b848ce10..000000000000 --- a/include/linux/kmalloc_sizes.h +++ /dev/null @@ -1,45 +0,0 @@ -#if (PAGE_SIZE == 4096) - CACHE(32) -#endif - CACHE(64) -#if L1_CACHE_BYTES < 64 - CACHE(96) -#endif - CACHE(128) -#if L1_CACHE_BYTES < 128 - CACHE(192) -#endif - CACHE(256) - CACHE(512) - CACHE(1024) - CACHE(2048) - CACHE(4096) - CACHE(8192) - CACHE(16384) - CACHE(32768) - CACHE(65536) - CACHE(131072) -#if KMALLOC_MAX_SIZE >= 262144 - CACHE(262144) -#endif -#if KMALLOC_MAX_SIZE >= 524288 - CACHE(524288) -#endif -#if KMALLOC_MAX_SIZE >= 1048576 - CACHE(1048576) -#endif -#if KMALLOC_MAX_SIZE >= 2097152 - CACHE(2097152) -#endif -#if KMALLOC_MAX_SIZE >= 4194304 - CACHE(4194304) -#endif -#if KMALLOC_MAX_SIZE >= 8388608 - CACHE(8388608) -#endif -#if KMALLOC_MAX_SIZE >= 16777216 - CACHE(16777216) -#endif -#if KMALLOC_MAX_SIZE >= 33554432 - CACHE(33554432) -#endif diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8bb6e0eaf3c6..e0f30ef9525d 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -11,8 +11,6 @@ */ #include -#include /* kmalloc_sizes.h needs PAGE_SIZE */ -#include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include /* @@ -104,15 +102,8 @@ struct kmem_cache { */ }; -/* Size description struct for general caches. */ -struct cache_sizes { - size_t cs_size; - struct kmem_cache *cs_cachep; -#ifdef CONFIG_ZONE_DMA - struct kmem_cache *cs_dmacachep; -#endif -}; -extern struct cache_sizes malloc_sizes[]; +extern struct kmem_cache *kmalloc_caches[PAGE_SHIFT + MAX_ORDER]; +extern struct kmem_cache *kmalloc_dma_caches[PAGE_SHIFT + MAX_ORDER]; void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); @@ -133,26 +124,19 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) void *ret; if (__builtin_constant_p(size)) { - int i = 0; + int i; if (!size) return ZERO_SIZE_PTR; -#define CACHE(x) \ - if (size <= x) \ - goto found; \ - else \ - i++; -#include -#undef CACHE - return NULL; -found: + i = kmalloc_index(size); + #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - cachep = malloc_sizes[i].cs_dmacachep; + cachep = kmalloc_dma_caches[i]; else #endif - cachep = malloc_sizes[i].cs_cachep; + cachep = kmalloc_caches[i]; ret = kmem_cache_alloc_trace(cachep, flags, size); @@ -186,26 +170,19 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) struct kmem_cache *cachep; if (__builtin_constant_p(size)) { - int i = 0; + int i; if (!size) return ZERO_SIZE_PTR; -#define CACHE(x) \ - if (size <= x) \ - goto found; \ - else \ - i++; -#include -#undef CACHE - return NULL; -found: + i = kmalloc_index(size); + #ifdef CONFIG_ZONE_DMA if (flags & GFP_DMA) - cachep = malloc_sizes[i].cs_dmacachep; + cachep = kmalloc_dma_caches[i]; else #endif - cachep = malloc_sizes[i].cs_cachep; + cachep = kmalloc_caches[i]; return kmem_cache_alloc_node_trace(cachep, flags, node, size); } diff --git a/mm/slab.c b/mm/slab.c index e7667a3584bc..2a7132ec4ff6 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -318,34 +318,18 @@ static void free_block(struct kmem_cache *cachep, 
void **objpp, int len, static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); -/* - * This function must be completely optimized away if a constant is passed to - * it. Mostly the same as what is in linux/slab.h except it returns an index. - */ -static __always_inline int index_of(const size_t size) -{ - extern void __bad_size(void); - - if (__builtin_constant_p(size)) { - int i = 0; +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_caches); -#define CACHE(x) \ - if (size <=x) \ - return i; \ - else \ - i++; -#include -#undef CACHE - __bad_size(); - } else - __bad_size(); - return 0; -} +#ifdef CONFIG_ZONE_DMA +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_dma_caches); +#endif static int slab_early_init = 1; -#define INDEX_AC index_of(sizeof(struct arraycache_init)) -#define INDEX_L3 index_of(sizeof(struct kmem_list3)) +#define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) +#define INDEX_L3 kmalloc_index(sizeof(struct kmem_list3)) static void kmem_list3_init(struct kmem_list3 *parent) { @@ -524,30 +508,6 @@ static inline unsigned int obj_to_index(const struct kmem_cache *cache, return reciprocal_divide(offset, cache->reciprocal_buffer_size); } -/* - * These are the default caches for kmalloc. Custom caches can have other sizes. - */ -struct cache_sizes malloc_sizes[] = { -#define CACHE(x) { .cs_size = (x) }, -#include - CACHE(ULONG_MAX) -#undef CACHE -}; -EXPORT_SYMBOL(malloc_sizes); - -/* Must match cache_sizes above. Out of line to keep cache footprint low. */ -struct cache_names { - char *name; - char *name_dma; -}; - -static struct cache_names __initdata cache_names[] = { -#define CACHE(x) { .name = "size-" #x, .name_dma = "size-" #x "(DMA)" }, -#include - {NULL,} -#undef CACHE -}; - static struct arraycache_init initarray_generic = { {0, BOOT_CPUCACHE_ENTRIES, 1, 0} }; @@ -625,19 +585,23 @@ static void slab_set_debugobj_lock_classes(struct kmem_cache *cachep) static void init_node_lock_keys(int q) { - struct cache_sizes *s = malloc_sizes; + int i; if (slab_state < UP) return; - for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { + for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { struct kmem_list3 *l3; + struct kmem_cache *cache = kmalloc_caches[i]; + + if (!cache) + continue; - l3 = s->cs_cachep->nodelists[q]; - if (!l3 || OFF_SLAB(s->cs_cachep)) + l3 = cache->nodelists[q]; + if (!l3 || OFF_SLAB(cache)) continue; - slab_set_lock_classes(s->cs_cachep, &on_slab_l3_key, + slab_set_lock_classes(cache, &on_slab_l3_key, &on_slab_alc_key, q); } } @@ -705,20 +669,19 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags) { - struct cache_sizes *csizep = malloc_sizes; + int i; #if DEBUG /* This happens if someone tries to call * kmem_cache_create(), or __kmalloc(), before * the generic caches are initialized. 
*/ - BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL); + BUG_ON(kmalloc_caches[INDEX_AC] == NULL); #endif if (!size) return ZERO_SIZE_PTR; - while (size > csizep->cs_size) - csizep++; + i = kmalloc_index(size); /* * Really subtle: The last entry with cs->cs_size==ULONG_MAX @@ -727,9 +690,9 @@ static inline struct kmem_cache *__find_general_cachep(size_t size, */ #ifdef CONFIG_ZONE_DMA if (unlikely(gfpflags & GFP_DMA)) - return csizep->cs_dmacachep; + return kmalloc_dma_caches[i]; #endif - return csizep->cs_cachep; + return kmalloc_caches[i]; } static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) @@ -1602,8 +1565,6 @@ static void setup_nodelists_pointer(struct kmem_cache *cachep) */ void __init kmem_cache_init(void) { - struct cache_sizes *sizes; - struct cache_names *names; int i; kmem_cache = &kmem_cache_boot; @@ -1657,8 +1618,6 @@ void __init kmem_cache_init(void) list_add(&kmem_cache->list, &slab_caches); /* 2+3) create the kmalloc caches */ - sizes = malloc_sizes; - names = cache_names; /* * Initialize the caches that provide memory for the array cache and the @@ -1666,35 +1625,39 @@ void __init kmem_cache_init(void) * bug. */ - sizes[INDEX_AC].cs_cachep = create_kmalloc_cache(names[INDEX_AC].name, - sizes[INDEX_AC].cs_size, ARCH_KMALLOC_FLAGS); + kmalloc_caches[INDEX_AC] = create_kmalloc_cache("kmalloc-ac", + kmalloc_size(INDEX_AC), ARCH_KMALLOC_FLAGS); if (INDEX_AC != INDEX_L3) - sizes[INDEX_L3].cs_cachep = - create_kmalloc_cache(names[INDEX_L3].name, - sizes[INDEX_L3].cs_size, ARCH_KMALLOC_FLAGS); + kmalloc_caches[INDEX_L3] = + create_kmalloc_cache("kmalloc-l3", + kmalloc_size(INDEX_L3), ARCH_KMALLOC_FLAGS); slab_early_init = 0; - while (sizes->cs_size != ULONG_MAX) { - /* - * For performance, all the general caches are L1 aligned. - * This should be particularly beneficial on SMP boxes, as it - * eliminates "false sharing". - * Note for systems short on memory removing the alignment will - * allow tighter packing of the smaller caches. - */ - if (!sizes->cs_cachep) - sizes->cs_cachep = create_kmalloc_cache(names->name, - sizes->cs_size, ARCH_KMALLOC_FLAGS); + for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { + size_t cs_size = kmalloc_size(i); + + if (cs_size < KMALLOC_MIN_SIZE) + continue; + + if (!kmalloc_caches[i]) { + /* + * For performance, all the general caches are L1 aligned. + * This should be particularly beneficial on SMP boxes, as it + * eliminates "false sharing". + * Note for systems short on memory removing the alignment will + * allow tighter packing of the smaller caches. 
+ */ + kmalloc_caches[i] = create_kmalloc_cache("kmalloc", + cs_size, ARCH_KMALLOC_FLAGS); + } #ifdef CONFIG_ZONE_DMA - sizes->cs_dmacachep = create_kmalloc_cache( - names->name_dma, sizes->cs_size, + kmalloc_dma_caches[i] = create_kmalloc_cache( + "kmalloc-dma", cs_size, SLAB_CACHE_DMA|ARCH_KMALLOC_FLAGS); #endif - sizes++; - names++; } /* 4) Replace the bootstrap head arrays */ { @@ -1713,17 +1676,16 @@ void __init kmem_cache_init(void) ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT); - BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep) + BUG_ON(cpu_cache_get(kmalloc_caches[INDEX_AC]) != &initarray_generic.cache); - memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep), + memcpy(ptr, cpu_cache_get(kmalloc_caches[INDEX_AC]), sizeof(struct arraycache_init)); /* * Do not assume that spinlocks can be initialized via memcpy: */ spin_lock_init(&ptr->lock); - malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] = - ptr; + kmalloc_caches[INDEX_AC]->array[smp_processor_id()] = ptr; } /* 5) Replace the bootstrap kmem_list3's */ { @@ -1732,17 +1694,39 @@ void __init kmem_cache_init(void) for_each_online_node(nid) { init_list(kmem_cache, &initkmem_list3[CACHE_CACHE + nid], nid); - init_list(malloc_sizes[INDEX_AC].cs_cachep, + init_list(kmalloc_caches[INDEX_AC], &initkmem_list3[SIZE_AC + nid], nid); if (INDEX_AC != INDEX_L3) { - init_list(malloc_sizes[INDEX_L3].cs_cachep, + init_list(kmalloc_caches[INDEX_L3], &initkmem_list3[SIZE_L3 + nid], nid); } } } slab_state = UP; + + /* Create the proper names */ + for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { + char *s; + struct kmem_cache *c = kmalloc_caches[i]; + + if (!c) + continue; + + s = kasprintf(GFP_NOWAIT, "kmalloc-%d", kmalloc_size(i)); + + BUG_ON(!s); + c->name = s; + +#ifdef CONFIG_ZONE_DMA + c = kmalloc_dma_caches[i]; + BUG_ON(!c); + s = kasprintf(GFP_NOWAIT, "dma-kmalloc-%d", kmalloc_size(i)); + BUG_ON(!s); + c->name = s; +#endif + } } void __init kmem_cache_init_late(void) @@ -2428,10 +2412,9 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) size += BYTES_PER_WORD; } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) - if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->object_size > cache_line_size() - && ALIGN(size, cachep->align) < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - ALIGN(size, cachep->align); + if (size >= kmalloc_size(INDEX_L3 + 1) + && cachep->object_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } #endif -- cgit From 6744f087ba2a49f6d6935d9daa0b20a0f03567b5 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:12:17 +0000 Subject: slab: Common name for the per node structures Rename the structure used for the per node structures in slab to have a name that expresses that fact. Acked-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 2 +- mm/slab.c | 87 ++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 45 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index e0f30ef9525d..8b5b2f6b36d3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -95,7 +95,7 @@ struct kmem_cache { * pointer for each node since "nodelists" uses the remainder of * available pointers. 
*/ - struct kmem_list3 **nodelists; + struct kmem_cache_node **nodelists; struct array_cache *array[NR_CPUS + MAX_NUMNODES]; /* * Do not add fields after array[] diff --git a/mm/slab.c b/mm/slab.c index 2a7132ec4ff6..7c0da4c86973 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -288,7 +288,7 @@ struct arraycache_init { /* * The slab lists for all objects. */ -struct kmem_list3 { +struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; @@ -306,13 +306,13 @@ struct kmem_list3 { * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_cache_node __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 #define SIZE_AC MAX_NUMNODES #define SIZE_L3 (2 * MAX_NUMNODES) static int drain_freelist(struct kmem_cache *cache, - struct kmem_list3 *l3, int tofree); + struct kmem_cache_node *l3, int tofree); static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node); static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); @@ -329,9 +329,9 @@ EXPORT_SYMBOL(kmalloc_dma_caches); static int slab_early_init = 1; #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) -#define INDEX_L3 kmalloc_index(sizeof(struct kmem_list3)) +#define INDEX_L3 kmalloc_index(sizeof(struct kmem_cache_node)) -static void kmem_list3_init(struct kmem_list3 *parent) +static void kmem_list3_init(struct kmem_cache_node *parent) { INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); @@ -546,7 +546,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, int q) { struct array_cache **alc; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; int r; l3 = cachep->nodelists[q]; @@ -591,7 +591,7 @@ static void init_node_lock_keys(int q) return; for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { - struct kmem_list3 *l3; + struct kmem_cache_node *l3; struct kmem_cache *cache = kmalloc_caches[i]; if (!cache) @@ -608,9 +608,8 @@ static void init_node_lock_keys(int q) static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) { - struct kmem_list3 *l3; - l3 = cachep->nodelists[q]; - if (!l3) + + if (!cachep->nodelists[q]) return; slab_set_lock_classes(cachep, &on_slab_l3_key, @@ -901,7 +900,7 @@ static inline bool is_slab_pfmemalloc(struct slab *slabp) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; + struct kmem_cache_node *l3 = cachep->nodelists[numa_mem_id()]; struct slab *slabp; unsigned long flags; @@ -934,7 +933,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ if (unlikely(is_obj_pfmemalloc(objp))) { - struct kmem_list3 *l3; + struct kmem_cache_node *l3; if (gfp_pfmemalloc_allowed(flags)) { clear_obj_pfmemalloc(&objp); @@ -1106,7 +1105,7 @@ static void free_alien_cache(struct array_cache **ac_ptr) static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { - struct kmem_list3 *rl3 = cachep->nodelists[node]; + struct kmem_cache_node *rl3 = cachep->nodelists[node]; if (ac->avail) { spin_lock(&rl3->list_lock); @@ -1127,7 +1126,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, /* * Called from cache_reap() to regularly drain alien caches round robin. 
*/ -static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) +static void reap_alien(struct kmem_cache *cachep, struct kmem_cache_node *l3) { int node = __this_cpu_read(slab_reap_node); @@ -1162,7 +1161,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { struct slab *slabp = virt_to_slab(objp); int nodeid = slabp->nodeid; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; struct array_cache *alien = NULL; int node; @@ -1207,8 +1206,8 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) static int init_cache_nodelists_node(int node) { struct kmem_cache *cachep; - struct kmem_list3 *l3; - const int memsize = sizeof(struct kmem_list3); + struct kmem_cache_node *l3; + const int memsize = sizeof(struct kmem_cache_node); list_for_each_entry(cachep, &slab_caches, list) { /* @@ -1244,7 +1243,7 @@ static int init_cache_nodelists_node(int node) static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; - struct kmem_list3 *l3 = NULL; + struct kmem_cache_node *l3 = NULL; int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); @@ -1309,7 +1308,7 @@ free_array_cache: static int __cpuinit cpuup_prepare(long cpu) { struct kmem_cache *cachep; - struct kmem_list3 *l3 = NULL; + struct kmem_cache_node *l3 = NULL; int node = cpu_to_mem(cpu); int err; @@ -1463,7 +1462,7 @@ static int __meminit drain_cache_nodelists_node(int node) int ret = 0; list_for_each_entry(cachep, &slab_caches, list) { - struct kmem_list3 *l3; + struct kmem_cache_node *l3; l3 = cachep->nodelists[node]; if (!l3) @@ -1516,15 +1515,15 @@ out: /* * swap the static kmem_list3 with kmalloced memory */ -static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, +static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node *list, int nodeid) { - struct kmem_list3 *ptr; + struct kmem_cache_node *ptr; - ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid); + ptr = kmalloc_node(sizeof(struct kmem_cache_node), GFP_NOWAIT, nodeid); BUG_ON(!ptr); - memcpy(ptr, list, sizeof(struct kmem_list3)); + memcpy(ptr, list, sizeof(struct kmem_cache_node)); /* * Do not assume that spinlocks can be initialized via memcpy: */ @@ -1556,7 +1555,7 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) */ static void setup_nodelists_pointer(struct kmem_cache *cachep) { - cachep->nodelists = (struct kmem_list3 **)&cachep->array[nr_cpu_ids]; + cachep->nodelists = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; } /* @@ -1613,7 +1612,7 @@ void __init kmem_cache_init(void) */ create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, array[nr_cpu_ids]) + - nr_node_ids * sizeof(struct kmem_list3 *), + nr_node_ids * sizeof(struct kmem_cache_node *), SLAB_HWCACHE_ALIGN); list_add(&kmem_cache->list, &slab_caches); @@ -1787,7 +1786,7 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { - struct kmem_list3 *l3; + struct kmem_cache_node *l3; struct slab *slabp; unsigned long flags; int node; @@ -2279,7 +2278,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) int node; for_each_online_node(node) { cachep->nodelists[node] = - kmalloc_node(sizeof(struct kmem_list3), + kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); BUG_ON(!cachep->nodelists[node]); kmem_list3_init(cachep->nodelists[node]); @@ -2547,7 +2546,7 @@ static void check_spinlock_acquired_node(struct kmem_cache 
*cachep, int node) #define check_spinlock_acquired_node(x, y) do { } while(0) #endif -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *l3, struct array_cache *ac, int force, int node); @@ -2567,7 +2566,7 @@ static void do_drain(void *arg) static void drain_cpu_caches(struct kmem_cache *cachep) { - struct kmem_list3 *l3; + struct kmem_cache_node *l3; int node; on_each_cpu(do_drain, cachep, 1); @@ -2592,7 +2591,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) * Returns the actual number of slabs released. */ static int drain_freelist(struct kmem_cache *cache, - struct kmem_list3 *l3, int tofree) + struct kmem_cache_node *l3, int tofree) { struct list_head *p; int nr_freed; @@ -2630,7 +2629,7 @@ out: static int __cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; drain_cpu_caches(cachep); @@ -2672,7 +2671,7 @@ EXPORT_SYMBOL(kmem_cache_shrink); int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; int rc = __cache_shrink(cachep); if (rc) @@ -2869,7 +2868,7 @@ static int cache_grow(struct kmem_cache *cachep, struct slab *slabp; size_t offset; gfp_t local_flags; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; /* * Be lazy and only check for valid flags here, keeping it out of the @@ -3059,7 +3058,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, bool force_refill) { int batchcount; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; struct array_cache *ac; int node; @@ -3391,7 +3390,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, { struct list_head *entry; struct slab *slabp; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; void *obj; int x; @@ -3586,7 +3585,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, int node) { int i; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; for (i = 0; i < nr_objects; i++) { void *objp; @@ -3632,7 +3631,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; int node = numa_mem_id(); batchcount = ac->batchcount; @@ -3924,7 +3923,7 @@ EXPORT_SYMBOL(kfree); static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) { int node; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; struct array_cache *new_shared; struct array_cache **new_alien = NULL; @@ -3969,7 +3968,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) free_alien_cache(new_alien); continue; } - l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node); + l3 = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); if (!l3) { free_alien_cache(new_alien); kfree(new_shared); @@ -4165,7 +4164,7 @@ skip_setup: * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. 
*/ -static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *l3, struct array_cache *ac, int force, int node) { int tofree; @@ -4204,7 +4203,7 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); @@ -4268,7 +4267,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) const char *name; char *error = NULL; int node; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; active_objs = 0; num_slabs = 0; @@ -4482,7 +4481,7 @@ static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); struct slab *slabp; - struct kmem_list3 *l3; + struct kmem_cache_node *l3; const char *name; unsigned long *n = m->private; int node; -- cgit From 6a67368c36e2c0c2578ba62f6264ab739af08cce Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:14:19 +0000 Subject: slab: Rename nodelists to node Have a common naming between both slab caches for future changes. Acked-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab_def.h | 2 +- mm/slab.c | 135 +++++++++++++++++++++++------------------------ 2 files changed, 68 insertions(+), 69 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8b5b2f6b36d3..4ff50e8d1a2c 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -95,7 +95,7 @@ struct kmem_cache { * pointer for each node since "nodelists" uses the remainder of * available pointers. 
*/ - struct kmem_cache_node **nodelists; + struct kmem_cache_node **node; struct array_cache *array[NR_CPUS + MAX_NUMNODES]; /* * Do not add fields after array[] diff --git a/mm/slab.c b/mm/slab.c index 7c0da4c86973..3416f4c544b3 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -347,7 +347,7 @@ static void kmem_list3_init(struct kmem_cache_node *parent) #define MAKE_LIST(cachep, listp, slab, nodeid) \ do { \ INIT_LIST_HEAD(listp); \ - list_splice(&(cachep->nodelists[nodeid]->slab), listp); \ + list_splice(&(cachep->node[nodeid]->slab), listp); \ } while (0) #define MAKE_ALL_LISTS(cachep, ptr, nodeid) \ @@ -549,7 +549,7 @@ static void slab_set_lock_classes(struct kmem_cache *cachep, struct kmem_cache_node *l3; int r; - l3 = cachep->nodelists[q]; + l3 = cachep->node[q]; if (!l3) return; @@ -597,7 +597,7 @@ static void init_node_lock_keys(int q) if (!cache) continue; - l3 = cache->nodelists[q]; + l3 = cache->node[q]; if (!l3 || OFF_SLAB(cache)) continue; @@ -608,8 +608,7 @@ static void init_node_lock_keys(int q) static void on_slab_lock_classes_node(struct kmem_cache *cachep, int q) { - - if (!cachep->nodelists[q]) + if (!cachep->node[q]) return; slab_set_lock_classes(cachep, &on_slab_l3_key, @@ -900,7 +899,7 @@ static inline bool is_slab_pfmemalloc(struct slab *slabp) static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { - struct kmem_cache_node *l3 = cachep->nodelists[numa_mem_id()]; + struct kmem_cache_node *l3 = cachep->node[numa_mem_id()]; struct slab *slabp; unsigned long flags; @@ -955,7 +954,7 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, * If there are empty slabs on the slabs_free list and we are * being forced to refill the cache, mark this one !pfmemalloc. */ - l3 = cachep->nodelists[numa_mem_id()]; + l3 = cachep->node[numa_mem_id()]; if (!list_empty(&l3->slabs_free) && force_refill) { struct slab *slabp = virt_to_slab(objp); ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); @@ -1105,7 +1104,7 @@ static void free_alien_cache(struct array_cache **ac_ptr) static void __drain_alien_cache(struct kmem_cache *cachep, struct array_cache *ac, int node) { - struct kmem_cache_node *rl3 = cachep->nodelists[node]; + struct kmem_cache_node *rl3 = cachep->node[node]; if (ac->avail) { spin_lock(&rl3->list_lock); @@ -1174,7 +1173,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) if (likely(slabp->nodeid == node)) return 0; - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; STATS_INC_NODEFREES(cachep); if (l3->alien && l3->alien[nodeid]) { alien = l3->alien[nodeid]; @@ -1186,24 +1185,24 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) ac_put_obj(cachep, alien, objp); spin_unlock(&alien->lock); } else { - spin_lock(&(cachep->nodelists[nodeid])->list_lock); + spin_lock(&(cachep->node[nodeid])->list_lock); free_block(cachep, &objp, 1, nodeid); - spin_unlock(&(cachep->nodelists[nodeid])->list_lock); + spin_unlock(&(cachep->node[nodeid])->list_lock); } return 1; } #endif /* - * Allocates and initializes nodelists for a node on each slab cache, used for + * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing nodelists are not replaced if + * When hotplugging memory or a cpu, existing node are not replaced if * already in use. * * Must hold slab_mutex. 
*/ -static int init_cache_nodelists_node(int node) +static int init_cache_node_node(int node) { struct kmem_cache *cachep; struct kmem_cache_node *l3; @@ -1215,7 +1214,7 @@ static int init_cache_nodelists_node(int node) * begin anything. Make sure some other cpu on this * node has not already allocated this */ - if (!cachep->nodelists[node]) { + if (!cachep->node[node]) { l3 = kmalloc_node(memsize, GFP_KERNEL, node); if (!l3) return -ENOMEM; @@ -1228,14 +1227,14 @@ static int init_cache_nodelists_node(int node) * go. slab_mutex is sufficient * protection here. */ - cachep->nodelists[node] = l3; + cachep->node[node] = l3; } - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = + spin_lock_irq(&cachep->node[node]->list_lock); + cachep->node[node]->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); + spin_unlock_irq(&cachep->node[node]->list_lock); } return 0; } @@ -1255,7 +1254,7 @@ static void __cpuinit cpuup_canceled(long cpu) /* cpu is dead; no one can alloc from it. */ nc = cachep->array[cpu]; cachep->array[cpu] = NULL; - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (!l3) goto free_array_cache; @@ -1298,7 +1297,7 @@ free_array_cache: * shrink each nodelist to its limit. */ list_for_each_entry(cachep, &slab_caches, list) { - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (!l3) continue; drain_freelist(cachep, l3, l3->free_objects); @@ -1318,7 +1317,7 @@ static int __cpuinit cpuup_prepare(long cpu) * kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ - err = init_cache_nodelists_node(node); + err = init_cache_node_node(node); if (err < 0) goto bad; @@ -1353,7 +1352,7 @@ static int __cpuinit cpuup_prepare(long cpu) } } cachep->array[cpu] = nc; - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; BUG_ON(!l3); spin_lock_irq(&l3->list_lock); @@ -1456,7 +1455,7 @@ static struct notifier_block __cpuinitdata cpucache_notifier = { * * Must hold slab_mutex. 
*/ -static int __meminit drain_cache_nodelists_node(int node) +static int __meminit drain_cache_node_node(int node) { struct kmem_cache *cachep; int ret = 0; @@ -1464,7 +1463,7 @@ static int __meminit drain_cache_nodelists_node(int node) list_for_each_entry(cachep, &slab_caches, list) { struct kmem_cache_node *l3; - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (!l3) continue; @@ -1493,12 +1492,12 @@ static int __meminit slab_memory_callback(struct notifier_block *self, switch (action) { case MEM_GOING_ONLINE: mutex_lock(&slab_mutex); - ret = init_cache_nodelists_node(nid); + ret = init_cache_node_node(nid); mutex_unlock(&slab_mutex); break; case MEM_GOING_OFFLINE: mutex_lock(&slab_mutex); - ret = drain_cache_nodelists_node(nid); + ret = drain_cache_node_node(nid); mutex_unlock(&slab_mutex); break; case MEM_ONLINE: @@ -1530,7 +1529,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); - cachep->nodelists[nodeid] = ptr; + cachep->node[nodeid] = ptr; } /* @@ -1542,8 +1541,8 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) int node; for_each_online_node(node) { - cachep->nodelists[node] = &initkmem_list3[index + node]; - cachep->nodelists[node]->next_reap = jiffies + + cachep->node[node] = &initkmem_list3[index + node]; + cachep->node[node]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; } @@ -1551,11 +1550,11 @@ static void __init set_up_list3s(struct kmem_cache *cachep, int index) /* * The memory after the last cpu cache pointer is used for the - * the nodelists pointer. + * the node pointer. */ -static void setup_nodelists_pointer(struct kmem_cache *cachep) +static void setup_node_pointer(struct kmem_cache *cachep) { - cachep->nodelists = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; + cachep->node = (struct kmem_cache_node **)&cachep->array[nr_cpu_ids]; } /* @@ -1567,7 +1566,7 @@ void __init kmem_cache_init(void) int i; kmem_cache = &kmem_cache_boot; - setup_nodelists_pointer(kmem_cache); + setup_node_pointer(kmem_cache); if (num_possible_nodes() == 1) use_alien_caches = 0; @@ -1756,7 +1755,7 @@ void __init kmem_cache_init_late(void) #ifdef CONFIG_NUMA /* * Register a memory hotplug callback that initializes and frees - * nodelists. + * node. 
*/ hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); #endif @@ -1801,7 +1800,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) unsigned long active_objs = 0, num_objs = 0, free_objects = 0; unsigned long active_slabs = 0, num_slabs = 0; - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (!l3) continue; @@ -2277,15 +2276,15 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } else { int node; for_each_online_node(node) { - cachep->nodelists[node] = + cachep->node[node] = kmalloc_node(sizeof(struct kmem_cache_node), gfp, node); - BUG_ON(!cachep->nodelists[node]); - kmem_list3_init(cachep->nodelists[node]); + BUG_ON(!cachep->node[node]); + kmem_list3_init(cachep->node[node]); } } } - cachep->nodelists[numa_mem_id()]->next_reap = + cachep->node[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; @@ -2388,7 +2387,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) else gfp = GFP_NOWAIT; - setup_nodelists_pointer(cachep); + setup_node_pointer(cachep); #if DEBUG /* @@ -2527,7 +2526,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); + assert_spin_locked(&cachep->node[numa_mem_id()]->list_lock); #endif } @@ -2535,7 +2534,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[node]->list_lock); + assert_spin_locked(&cachep->node[node]->list_lock); #endif } @@ -2558,9 +2557,9 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); - spin_lock(&cachep->nodelists[node]->list_lock); + spin_lock(&cachep->node[node]->list_lock); free_block(cachep, ac->entry, ac->avail, node); - spin_unlock(&cachep->nodelists[node]->list_lock); + spin_unlock(&cachep->node[node]->list_lock); ac->avail = 0; } @@ -2572,13 +2571,13 @@ static void drain_cpu_caches(struct kmem_cache *cachep) on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (l3 && l3->alien) drain_alien_cache(cachep, l3->alien); } for_each_online_node(node) { - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (l3) drain_array(cachep, l3, l3->shared, 1, node); } @@ -2635,7 +2634,7 @@ static int __cache_shrink(struct kmem_cache *cachep) check_irq_on(); for_each_online_node(i) { - l3 = cachep->nodelists[i]; + l3 = cachep->node[i]; if (!l3) continue; @@ -2682,7 +2681,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) /* NUMA: free the list3 structures */ for_each_online_node(i) { - l3 = cachep->nodelists[i]; + l3 = cachep->node[i]; if (l3) { kfree(l3->shared); free_alien_cache(l3->alien); @@ -2879,7 +2878,7 @@ static int cache_grow(struct kmem_cache *cachep, /* Take the l3 list lock to change the colour_next on this node */ check_irq_off(); - l3 = cachep->nodelists[nodeid]; + l3 = cachep->node[nodeid]; spin_lock(&l3->list_lock); /* Get colour for the slab, and cal the next value. 
*/ @@ -3077,7 +3076,7 @@ retry: */ batchcount = BATCHREFILL_LIMIT; } - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; BUG_ON(ac->avail > 0 || !l3); spin_lock(&l3->list_lock); @@ -3299,7 +3298,7 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) /* * Fallback function if there was no memory available and no objects on a * certain node and fall back is permitted. First we scan all the - * available nodelists for available objects. If that fails then we + * available node for available objects. If that fails then we * perform an allocation without specifying a node. This allows the page * allocator to do its reclaim / fallback magic. We then insert the * slab into the proper nodelist and then allocate from it. @@ -3333,8 +3332,8 @@ retry: nid = zone_to_nid(zone); if (cpuset_zone_allowed_hardwall(zone, flags) && - cache->nodelists[nid] && - cache->nodelists[nid]->free_objects) { + cache->node[nid] && + cache->node[nid]->free_objects) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (obj) @@ -3394,7 +3393,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, void *obj; int x; - l3 = cachep->nodelists[nodeid]; + l3 = cachep->node[nodeid]; BUG_ON(!l3); retry: @@ -3479,7 +3478,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, if (nodeid == NUMA_NO_NODE) nodeid = slab_node; - if (unlikely(!cachep->nodelists[nodeid])) { + if (unlikely(!cachep->node[nodeid])) { /* Node not bootstrapped yet */ ptr = fallback_alloc(cachep, flags); goto out; @@ -3595,7 +3594,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, objp = objpp[i]; slabp = virt_to_slab(objp); - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; list_del(&slabp->list); check_spinlock_acquired_node(cachep, node); check_slabp(cachep, slabp); @@ -3639,7 +3638,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) BUG_ON(!batchcount || batchcount > ac->avail); #endif check_irq_off(); - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; spin_lock(&l3->list_lock); if (l3->shared) { struct array_cache *shared_array = l3->shared; @@ -3946,7 +3945,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) } } - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (l3) { struct array_cache *shared = l3->shared; @@ -3982,7 +3981,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp) l3->alien = new_alien; l3->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - cachep->nodelists[node] = l3; + cachep->node[node] = l3; } return 0; @@ -3991,13 +3990,13 @@ fail: /* Cache is not active yet. 
Roll back what we did */ node--; while (node >= 0) { - if (cachep->nodelists[node]) { - l3 = cachep->nodelists[node]; + if (cachep->node[node]) { + l3 = cachep->node[node]; kfree(l3->shared); free_alien_cache(l3->alien); kfree(l3); - cachep->nodelists[node] = NULL; + cachep->node[node] = NULL; } node--; } @@ -4057,9 +4056,9 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + spin_lock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + spin_unlock_irq(&cachep->node[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4219,7 +4218,7 @@ static void cache_reap(struct work_struct *w) * have established with reasonable certainty that * we can do some work if the lock was obtained. */ - l3 = searchp->nodelists[node]; + l3 = searchp->node[node]; reap_alien(searchp, l3); @@ -4272,7 +4271,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) active_objs = 0; num_slabs = 0; for_each_online_node(node) { - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (!l3) continue; @@ -4497,7 +4496,7 @@ static int leaks_show(struct seq_file *m, void *p) n[1] = 0; for_each_online_node(node) { - l3 = cachep->nodelists[node]; + l3 = cachep->node[node]; if (!l3) continue; -- cgit From 95a05b428cc675694321c8f762591984f3fd2b1e Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:14:19 +0000 Subject: slab: Common constants for kmalloc boundaries Standardize the constants that describe the smallest and largest object kept in the kmalloc arrays for SLAB and SLUB. Differentiate between the maximum size for which a slab cache is used (KMALLOC_MAX_CACHE_SIZE) and the maximum allocatable size (KMALLOC_MAX_SIZE, KMALLOC_MAX_ORDER). Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab.h | 34 ++++++++++++++++++++++++---------- include/linux/slub_def.h | 19 +++---------------- mm/slub.c | 22 +++++++++++----------- 3 files changed, 38 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index c97fe92532d1..c01780540054 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -163,7 +163,12 @@ struct kmem_cache { #else /* CONFIG_SLOB */ /* - * The largest kmalloc size supported by the slab allocators is + * Kmalloc array related definitions + */ + +#ifdef CONFIG_SLAB +/* + * The largest kmalloc size supported by the SLAB allocators is * 32 megabyte (2^25) or the maximum allocatable page order if that is * less than 32 MB. * @@ -173,9 +178,24 @@ struct kmem_cache { */ #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ (MAX_ORDER + PAGE_SHIFT - 1) : 25) +#define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#define KMALLOC_SHIFT_LOW 5 +#else +/* + * SLUB allocates up to order 2 pages directly and otherwise + * passes the request to the page allocator. 
+ */ +#define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_LOW 3 +#endif -#define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_HIGH) -#define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_HIGH - PAGE_SHIFT) +/* Maximum allocatable size */ +#define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) +/* Maximum size for which we actually use a slab cache */ +#define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) +/* Maximum order allocatable via the slab allocator */ +#define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) /* * Kmalloc subsystem. @@ -183,15 +203,9 @@ struct kmem_cache { #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #else -#ifdef CONFIG_SLAB -#define KMALLOC_MIN_SIZE 32 -#else -#define KMALLOC_MIN_SIZE 8 -#endif +#define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif -#define KMALLOC_SHIFT_LOW ilog2(KMALLOC_MIN_SIZE) - /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 99c3e05ff1f0..032028ef9a34 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -115,19 +115,6 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -/* - * Maximum kmalloc object size handled by SLUB. Larger object allocations - * are passed through to the page allocator. The page allocator "fastpath" - * is relatively slow so we need this value sufficiently high so that - * performance critical objects are allocated through the SLUB fastpath. - * - * This should be dropped to PAGE_SIZE / 2 once the page allocator - * "fastpath" becomes competitive with the slab allocator fastpaths. - */ -#define SLUB_MAX_SIZE (2 * PAGE_SIZE) - -#define SLUB_PAGE_SHIFT (PAGE_SHIFT + 2) - #ifdef CONFIG_ZONE_DMA #define SLUB_DMA __GFP_DMA #else /* Disable DMA functionality */ #define SLUB_DMA (__force gfp_t)0 #endif /* * We keep the general caches in an array of slab caches that are used for * 2^x bytes of allocations. */ -extern struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; +extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; /* * Find the slab cache for a given combination of allocation flags and size.
@@ -211,7 +198,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { - if (size > SLUB_MAX_SIZE) + if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); if (!(flags & SLUB_DMA)) { @@ -247,7 +234,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= SLUB_MAX_SIZE && !(flags & SLUB_DMA)) { + size <= KMALLOC_MAX_CACHE_SIZE && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); if (!s) diff --git a/mm/slub.c b/mm/slub.c index ba2ca53f6c3a..d0f72ee06310 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2775,7 +2775,7 @@ init_kmem_cache_node(struct kmem_cache_node *n) static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < - SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); + KMALLOC_SHIFT_HIGH * sizeof(struct kmem_cache_cpu)); /* * Must align to double word boundary for the double cmpxchg @@ -3174,11 +3174,11 @@ int __kmem_cache_shutdown(struct kmem_cache *s) * Kmalloc subsystem *******************************************************************/ -struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; EXPORT_SYMBOL(kmalloc_caches); #ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; +static struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; #endif static int __init setup_slub_min_order(char *str) @@ -3280,7 +3280,7 @@ void *__kmalloc(size_t size, gfp_t flags) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, flags); s = get_slab(size, flags); @@ -3316,7 +3316,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, flags, node); trace_kmalloc_node(_RET_IP_, ret, @@ -3721,7 +3721,7 @@ void __init kmem_cache_init(void) caches++; } - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); caches++; } @@ -3739,7 +3739,7 @@ void __init kmem_cache_init(void) BUG_ON(!kmalloc_caches[2]->name); } - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); BUG_ON(!s); @@ -3751,7 +3751,7 @@ void __init kmem_cache_init(void) #endif #ifdef CONFIG_ZONE_DMA - for (i = 0; i < SLUB_PAGE_SHIFT; i++) { + for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache *s = kmalloc_caches[i]; if (s && s->size) { @@ -3930,7 +3930,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, gfpflags); s = get_slab(size, gfpflags); @@ -3953,7 +3953,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) { + if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) { ret = kmalloc_large_node(size, gfpflags, node); trace_kmalloc_node(caller, ret, @@ -4312,7 +4312,7 @@ static void 
resiliency_test(void) { u8 *p; - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); printk(KERN_ERR "SLUB resiliency testing\n"); printk(KERN_ERR "-----------------------\n"); -- cgit From 9425c58e5445277699ff3c2a87bac1cfebc1b48d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:12:17 +0000 Subject: slab: Common definition for the array of kmalloc caches Have a common definition for the kmalloc cache arrays in SLAB and SLUB. Acked-by: Glauber Costa Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab.h | 5 +++++ include/linux/slab_def.h | 3 --- include/linux/slub_def.h | 6 ------ mm/slab.c | 8 -------- mm/slab_common.c | 8 ++++++++ mm/slub.c | 7 ------- 6 files changed, 13 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index c01780540054..f2327a898a85 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -206,6 +206,11 @@ struct kmem_cache { #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif +extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +#ifdef CONFIG_ZONE_DMA +extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +#endif + /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 4ff50e8d1a2c..113ec080313f 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -102,9 +102,6 @@ struct kmem_cache { */ }; -extern struct kmem_cache *kmalloc_caches[PAGE_SHIFT + MAX_ORDER]; -extern struct kmem_cache *kmalloc_dma_caches[PAGE_SHIFT + MAX_ORDER]; - void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 032028ef9a34..3701896f7f8a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -122,12 +122,6 @@ struct kmem_cache { #define SLUB_DMA (__force gfp_t)0 #endif -/* - * We keep the general caches in an array of slab caches that are used for - * 2^x bytes of allocations. - */ -extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; - /* * Find the slab cache for a given combination of allocation flags and size.
* diff --git a/mm/slab.c b/mm/slab.c index 3416f4c544b3..357f0bdc5e43 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -318,14 +318,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int len, static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp); static void cache_reap(struct work_struct *unused); -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; -EXPORT_SYMBOL(kmalloc_caches); - -#ifdef CONFIG_ZONE_DMA -struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; -EXPORT_SYMBOL(kmalloc_dma_caches); -#endif - static int slab_early_init = 1; #define INDEX_AC kmalloc_index(sizeof(struct arraycache_init)) diff --git a/mm/slab_common.c b/mm/slab_common.c index 53adfbf2f3b2..0437b8189b8a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -319,6 +319,14 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, return s; } +struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_caches); + +#ifdef CONFIG_ZONE_DMA +struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; +EXPORT_SYMBOL(kmalloc_dma_caches); +#endif + #endif /* !CONFIG_SLOB */ diff --git a/mm/slub.c b/mm/slub.c index d0f72ee06310..527cbfb5c49b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3174,13 +3174,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s) * Kmalloc subsystem *******************************************************************/ -struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; -EXPORT_SYMBOL(kmalloc_caches); - -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; -#endif - static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); -- cgit From 2c59dd6544212faa5ce761920d2251f4152f408d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:14:19 +0000 Subject: slab: Common Kmalloc cache determination Extract the optimized lookup functions from slub and put them into slab_common.c. Then make slab use these functions as well. Joonsoo notes that this fixes some issues with constant folding which also reduces the code size for slub. https://lkml.org/lkml/2012/10/20/82 Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 41 +++++------------- mm/slab.c | 40 ++---------------- mm/slab.h | 3 ++ mm/slab_common.c | 105 ++++++++++++++++++++++++++++++++++++++++++++- mm/slub.c | 108 +++-------------------------------------------- 5 files changed, 124 insertions(+), 173 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3701896f7f8a..16341e5316de 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -115,29 +115,6 @@ struct kmem_cache { struct kmem_cache_node *node[MAX_NUMNODES]; }; -#ifdef CONFIG_ZONE_DMA -#define SLUB_DMA __GFP_DMA -#else -/* Disable DMA functionality */ -#define SLUB_DMA (__force gfp_t)0 -#endif - -/* - * Find the slab cache for a given combination of allocation flags and size. - * - * This ought to end up with a global pointer to the right cache - * in kmalloc_caches. 
- */ -static __always_inline struct kmem_cache *kmalloc_slab(size_t size) -{ - int index = kmalloc_index(size); - - if (index == 0) - return NULL; - - return kmalloc_caches[index]; -} - void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); @@ -195,13 +172,14 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); - if (!(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + if (!(flags & GFP_DMA)) { + int index = kmalloc_index(size); - if (!s) + if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_trace(s, flags, size); + return kmem_cache_alloc_trace(kmalloc_caches[index], + flags, size); } } return __kmalloc(size, flags); @@ -228,13 +206,14 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && - size <= KMALLOC_MAX_CACHE_SIZE && !(flags & SLUB_DMA)) { - struct kmem_cache *s = kmalloc_slab(size); + size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) { + int index = kmalloc_index(size); - if (!s) + if (!index) return ZERO_SIZE_PTR; - return kmem_cache_alloc_node_trace(s, flags, node, size); + return kmem_cache_alloc_node_trace(kmalloc_caches[index], + flags, node, size); } return __kmalloc_node(size, flags, node); } diff --git a/mm/slab.c b/mm/slab.c index 08ba44f81a28..62629b11df38 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -656,40 +656,6 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) return cachep->array[smp_processor_id()]; } -static inline struct kmem_cache *__find_general_cachep(size_t size, - gfp_t gfpflags) -{ - int i; - -#if DEBUG - /* This happens if someone tries to call - * kmem_cache_create(), or __kmalloc(), before - * the generic caches are initialized. - */ - BUG_ON(kmalloc_caches[INDEX_AC] == NULL); -#endif - if (!size) - return ZERO_SIZE_PTR; - - i = kmalloc_index(size); - - /* - * Really subtle: The last entry with cs->cs_size==ULONG_MAX - * has cs_{dma,}cachep==NULL. Thus no special case - * for large kmalloc calls required. - */ -#ifdef CONFIG_ZONE_DMA - if (unlikely(gfpflags & GFP_DMA)) - return kmalloc_dma_caches[i]; -#endif - return kmalloc_caches[i]; -} - -static struct kmem_cache *kmem_find_general_cachep(size_t size, gfp_t gfpflags) -{ - return __find_general_cachep(size, gfpflags); -} - static size_t slab_mgmt_size(size_t nr_objs, size_t align) { return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); @@ -2426,7 +2392,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { - cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u); + cachep->slabp_cache = kmalloc_slab(slab_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. * But since we go off slab only for object size greater than @@ -3729,7 +3695,7 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller) { struct kmem_cache *cachep; - cachep = kmem_find_general_cachep(size, flags); + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; return kmem_cache_alloc_node_trace(cachep, flags, node, size); @@ -3774,7 +3740,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, * Then kmalloc uses the uninlined functions instead of the inline * functions. 
 */ - cachep = __find_general_cachep(size, flags); + cachep = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; ret = slab_alloc(cachep, flags, caller); diff --git a/mm/slab.h b/mm/slab.h index 44c0bd6dc19e..c01bc8921ac5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -38,6 +38,9 @@ unsigned long calculate_alignment(unsigned long flags, #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void create_kmalloc_caches(unsigned long); + +/* Find the kmalloc slab corresponding to a given size */ +struct kmem_cache *kmalloc_slab(size_t, gfp_t); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 2b0ebb6d071d..6d73f0b7f21c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -327,6 +327,68 @@ struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1]; EXPORT_SYMBOL(kmalloc_dma_caches); #endif +/* + * Conversion table for small slab sizes / 8 to the index in the + * kmalloc array. This is necessary for slabs < 192 since we have non power + * of two cache sizes there. The size of larger slabs can be determined using + * fls. + */ +static s8 size_index[24] = { + 3, /* 8 */ + 4, /* 16 */ + 5, /* 24 */ + 5, /* 32 */ + 6, /* 40 */ + 6, /* 48 */ + 6, /* 56 */ + 6, /* 64 */ + 1, /* 72 */ + 1, /* 80 */ + 1, /* 88 */ + 1, /* 96 */ + 7, /* 104 */ + 7, /* 112 */ + 7, /* 120 */ + 7, /* 128 */ + 2, /* 136 */ + 2, /* 144 */ + 2, /* 152 */ + 2, /* 160 */ + 2, /* 168 */ + 2, /* 176 */ + 2, /* 184 */ + 2 /* 192 */ +}; + +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + +/* + * Find the kmem_cache structure that serves a given size of + * allocation + */ +struct kmem_cache *kmalloc_slab(size_t size, gfp_t flags) +{ + int index; + + if (size <= 192) { + if (!size) + return ZERO_SIZE_PTR; + + index = size_index[size_index_elem(size)]; + } else + index = fls(size - 1); + +#ifdef CONFIG_ZONE_DMA + if (unlikely((flags & SLAB_CACHE_DMA))) + return kmalloc_dma_caches[index]; + +#endif + return kmalloc_caches[index]; +} + /* * Create the kmalloc array. Some of the regular kmalloc arrays * may already have been created because they were needed to @@ -336,6 +398,47 @@ void __init create_kmalloc_caches(unsigned long flags) { int i; + /* + * Patch up the size_index table if we have strange large alignment + * requirements for the kmalloc array. This is only the case for + * MIPS it seems. The standard arches will not generate any code here. + * + * Largest permitted alignment is 256 bytes due to the way we + * handle the index determination for the smaller caches. + * + * Make sure that nothing crazy happens if someone starts tinkering + * around with ARCH_KMALLOC_MINALIGN + */ + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || + (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); + + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } + + if (KMALLOC_MIN_SIZE >= 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + + } + + if (KMALLOC_MIN_SIZE >= 128) { + /* + * The 192 byte sized cache is not used if the alignment + * is 128 byte. Redirect kmalloc to use the 256 byte cache + * instead.
+ */ + for (i = 128 + 8; i <= 192; i += 8) + size_index[size_index_elem(i)] = 8; + } /* Caches that are not of the two-to-the-power-of size */ if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1]) kmalloc_caches[1] = create_kmalloc_cache(NULL, 96, flags); @@ -379,8 +482,6 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif } - - #endif /* !CONFIG_SLOB */ diff --git a/mm/slub.c b/mm/slub.c index e813c2d30fe0..6184b0821f7e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2982,7 +2982,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) s->allocflags |= __GFP_COMP; if (s->flags & SLAB_CACHE_DMA) - s->allocflags |= SLUB_DMA; + s->allocflags |= GFP_DMA; if (s->flags & SLAB_RECLAIM_ACCOUNT) s->allocflags |= __GFP_RECLAIMABLE; @@ -3210,64 +3210,6 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -/* - * Conversion table for small slabs sizes / 8 to the index in the - * kmalloc array. This is necessary for slabs < 192 since we have non power - * of two cache sizes there. The size of larger slabs can be determined using - * fls. - */ -static s8 size_index[24] = { - 3, /* 8 */ - 4, /* 16 */ - 5, /* 24 */ - 5, /* 32 */ - 6, /* 40 */ - 6, /* 48 */ - 6, /* 56 */ - 6, /* 64 */ - 1, /* 72 */ - 1, /* 80 */ - 1, /* 88 */ - 1, /* 96 */ - 7, /* 104 */ - 7, /* 112 */ - 7, /* 120 */ - 7, /* 128 */ - 2, /* 136 */ - 2, /* 144 */ - 2, /* 152 */ - 2, /* 160 */ - 2, /* 168 */ - 2, /* 176 */ - 2, /* 184 */ - 2 /* 192 */ -}; - -static inline int size_index_elem(size_t bytes) -{ - return (bytes - 1) / 8; -} - -static struct kmem_cache *get_slab(size_t size, gfp_t flags) -{ - int index; - - if (size <= 192) { - if (!size) - return ZERO_SIZE_PTR; - - index = size_index[size_index_elem(size)]; - } else - index = fls(size - 1); - -#ifdef CONFIG_ZONE_DMA - if (unlikely((flags & SLUB_DMA))) - return kmalloc_dma_caches[index]; - -#endif - return kmalloc_caches[index]; -} - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; @@ -3276,7 +3218,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, flags); - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -3319,7 +3261,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return ret; } - s = get_slab(size, flags); + s = kmalloc_slab(size, flags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -3632,7 +3574,6 @@ void __init kmem_cache_init(void) { static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; - int i; if (debug_guardpage_minorder()) slub_max_order = 0; @@ -3663,45 +3604,6 @@ void __init kmem_cache_init(void) kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ - - /* - * Patch up the size_index table if we have strange large alignment - * requirements for the kmalloc array. This is only the case for - * MIPS it seems. The standard arches will not generate any code here. - * - * Largest permitted alignment is 256 bytes due to the way we - * handle the index determination for the smaller caches. 
- * - * Make sure that nothing crazy happens if someone starts tinkering - * around with ARCH_KMALLOC_MINALIGN - */ - BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || - (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { - int elem = size_index_elem(i); - if (elem >= ARRAY_SIZE(size_index)) - break; - size_index[elem] = KMALLOC_SHIFT_LOW; - } - - if (KMALLOC_MIN_SIZE == 64) { - /* - * The 96 byte size cache is not used if the alignment - * is 64 byte. - */ - for (i = 64 + 8; i <= 96; i += 8) - size_index[size_index_elem(i)] = 7; - } else if (KMALLOC_MIN_SIZE == 128) { - /* - * The 192 byte sized cache is not used if the alignment - * is 128 byte. Redirect kmalloc to use the 256 byte cache - * instead. - */ - for (i = 128 + 8; i <= 192; i += 8) - size_index[size_index_elem(i)] = 8; - } - create_kmalloc_caches(0); #ifdef CONFIG_SMP @@ -3877,7 +3779,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) return kmalloc_large(size, gfpflags); - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; @@ -3907,7 +3809,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } - s = get_slab(size, gfpflags); + s = kmalloc_slab(size, gfpflags); if (unlikely(ZERO_OR_NULL_PTR(s))) return s; -- cgit From ca34956b804b7554fc4e88826773380d9d5122a8 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 10 Jan 2013 19:14:19 +0000 Subject: slab: Common definition for kmem_cache_node Put the definitions for the kmem_cache_node structures together so that we have one structure. That will allow us to create more common fields in the future which could yield more opportunities to share code. Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slub_def.h | 11 ----------- mm/slab.c | 17 ----------------- mm/slab.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 32 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 16341e5316de..027276fa8713 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -53,17 +53,6 @@ struct kmem_cache_cpu { #endif }; -struct kmem_cache_node { - spinlock_t list_lock; /* Protect partial list and nr_partial */ - unsigned long nr_partial; - struct list_head partial; -#ifdef CONFIG_SLUB_DEBUG - atomic_long_t nr_slabs; - atomic_long_t total_objects; - struct list_head full; -#endif -}; - /* * Word size structure that can be atomically updated or read and that * contains both the order and the number of objects that a slab of the diff --git a/mm/slab.c b/mm/slab.c index c162b2eb493a..17f859614546 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -285,23 +285,6 @@ struct arraycache_init { void *entries[BOOT_CPUCACHE_ENTRIES]; }; -/* - * The slab lists for all objects. - */ -struct kmem_cache_node { - struct list_head slabs_partial; /* partial list first, better asm code */ - struct list_head slabs_full; - struct list_head slabs_free; - unsigned long free_objects; - unsigned int free_limit; - unsigned int colour_next; /* Per-node cache coloring */ - spinlock_t list_lock; - struct array_cache *shared; /* shared per node */ - struct array_cache **alien; /* on other nodes */ - unsigned long next_reap; /* updated without locking */ - int free_touched; /* updated without locking */ -}; - /* * Need this for bootstrapping a per node allocator. 
*/ diff --git a/mm/slab.h b/mm/slab.h index f0a552ff7b9b..f96b49e4704e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -239,3 +239,35 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) return s; } #endif + + +/* + * The slab lists for all objects. + */ +struct kmem_cache_node { + spinlock_t list_lock; + +#ifdef CONFIG_SLAB + struct list_head slabs_partial; /* partial list first, better asm code */ + struct list_head slabs_full; + struct list_head slabs_free; + unsigned long free_objects; + unsigned int free_limit; + unsigned int colour_next; /* Per-node cache coloring */ + struct array_cache *shared; /* shared per node */ + struct array_cache **alien; /* on other nodes */ + unsigned long next_reap; /* updated without locking */ + int free_touched; /* updated without locking */ +#endif + +#ifdef CONFIG_SLUB + unsigned long nr_partial; + struct list_head partial; +#ifdef CONFIG_SLUB_DEBUG + atomic_long_t nr_slabs; + atomic_long_t total_objects; + struct list_head full; +#endif +#endif + +}; -- cgit From c601fd6956e92b0eb268d4af754073c76155b99d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Feb 2013 16:36:47 +0000 Subject: slab: Handle ARCH_DMA_MINALIGN correctly James Hogan hit boot problems in next-20130204 on Meta: META213-Thread0 DSP [LogF] kobject (4fc03980): tried to init an initialized object, something is seriously wrong. META213-Thread0 DSP [LogF] META213-Thread0 DSP [LogF] Call trace: META213-Thread0 DSP [LogF] [<4000888c>] _show_stack+0x68/0x7c META213-Thread0 DSP [LogF] [<400088b4>] _dump_stack+0x14/0x28 META213-Thread0 DSP [LogF] [<40103794>] _kobject_init+0x58/0x9c META213-Thread0 DSP [LogF] [<40103810>] _kobject_create+0x38/0x64 META213-Thread0 DSP [LogF] [<40103eac>] _kobject_create_and_add+0x14/0x8c META213-Thread0 DSP [LogF] [<40190ac4>] _mnt_init+0xd8/0x220 META213-Thread0 DSP [LogF] [<40190508>] _vfs_caches_init+0xb0/0x160 META213-Thread0 DSP [LogF] [<401851f4>] _start_kernel+0x274/0x340 META213-Thread0 DSP [LogF] [<40188424>] _metag_start_kernel+0x58/0x6c META213-Thread0 DSP [LogF] [<40000044>] __start+0x44/0x48 META213-Thread0 DSP [LogF] META213-Thread0 DSP [LogF] devtmpfs: initialized META213-Thread0 DSP [LogF] L2 Cache: Not present META213-Thread0 DSP [LogF] BUG: failure at fs/sysfs/dir.c:736/sysfs_read_ns_type()! META213-Thread0 DSP [LogF] Kernel panic - not syncing: BUG! META213-Thread0 DSP [Thread Exit] Thread has exited - return code = 4294967295 And bisected the problem to commit 95a05b4 ("slab: Common constants for kmalloc boundaries"). As it turns out, a fixed KMALLOC_SHIFT_LOW does not work for arches with higher alignment requirements. Determine KMALLOC_SHIFT_LOW from ARCH_DMA_MINALIGN instead. Reported-and-tested-by: James Hogan Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- include/linux/slab.h | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index f2327a898a85..0c621752caa6 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -133,6 +133,19 @@ void kfree(const void *); void kzfree(const void *); size_t ksize(const void *); +/* + * Some archs want to perform DMA into kmalloc caches and need a guaranteed + * alignment larger than the alignment of a 64-bit integer. + * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. 
+ */ +#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 +#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN +#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN +#define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) +#else +#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) +#endif + #ifdef CONFIG_SLOB /* * Common fields provided in kmem_cache by all slab allocators @@ -179,7 +192,9 @@ struct kmem_cache { #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ (MAX_ORDER + PAGE_SHIFT - 1) : 25) #define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH +#ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 +#endif #else /* * SLUB allocates up to order 2 pages directly and otherwise @@ -187,8 +202,10 @@ struct kmem_cache { */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif +#endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) @@ -200,9 +217,7 @@ struct kmem_cache { /* * Kmalloc subsystem. */ -#if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 -#define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN -#else +#ifndef KMALLOC_MIN_SIZE #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif @@ -289,17 +304,6 @@ static __always_inline int kmalloc_size(int n) } #endif /* !CONFIG_SLOB */ -/* - * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than the alignment of a 64-bit integer. - * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. - */ -#ifdef ARCH_DMA_MINALIGN -#define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN -#else -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - /* * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. * Intended for arches that get misalignment faults even for 64 bit integer -- cgit From f43f627d2f17e95c78647eeddf968d12f5c286b1 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Mon, 4 Feb 2013 13:37:20 +0000 Subject: PM: make VT switching to the suspend console optional v3 KMS drivers can potentially restore the display configuration without userspace help. Such drivers can call a new function, pm_vt_switch_required(false), if they support this feature. In that case, the PM layer won't VT switch to the suspend console at suspend time and then back to the original VT on resume, but rather leave things alone for a nicer-looking suspend and resume sequence. v2: make a function so we can handle multiple drivers (Alan) v3: use a list to track device requests (Rafael) v4: Squash in build fix from Jesse for CONFIG_VT_CONSOLE_SLEEP=n v5: Squash in patch from Wu Fengguang to add a few missing static qualifiers. v6: Add missing EXPORT_SYMBOL. Signed-off-by: Jesse Barnes Reviewed-by: Rafael J.
Wysocki (v3) Signed-off-by: Daniel Vetter --- include/linux/pm.h | 13 ++++++ kernel/power/console.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 129 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pm.h b/include/linux/pm.h index 03d7bb145311..e5da2f353e8f 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -35,6 +35,19 @@ extern void (*pm_idle)(void); extern void (*pm_power_off)(void); extern void (*pm_power_off_prepare)(void); +struct device; /* we have a circular dep with device.h */ +#ifdef CONFIG_VT_CONSOLE_SLEEP +extern void pm_vt_switch_required(struct device *dev, bool required); +extern void pm_vt_switch_unregister(struct device *dev); +#else +static inline void pm_vt_switch_required(struct device *dev, bool required) +{ +} +static inline void pm_vt_switch_unregister(struct device *dev) +{ +} +#endif /* CONFIG_VT_CONSOLE_SLEEP */ + /* * Device power management */ diff --git a/kernel/power/console.c b/kernel/power/console.c index b1dc456474b5..463aa6736751 100644 --- a/kernel/power/console.c +++ b/kernel/power/console.c @@ -4,6 +4,7 @@ * Originally from swsusp. */ +#include #include #include #include @@ -14,8 +15,120 @@ static int orig_fgconsole, orig_kmsg; +static DEFINE_MUTEX(vt_switch_mutex); + +struct pm_vt_switch { + struct list_head head; + struct device *dev; + bool required; +}; + +static LIST_HEAD(pm_vt_switch_list); + + +/** + * pm_vt_switch_required - indicate VT switch at suspend requirements + * @dev: device + * @required: if true, caller needs VT switch at suspend/resume time + * + * The different console drivers may or may not require VT switches across + * suspend/resume, depending on how they handle restoring video state and + * what may be running. + * + * Drivers can indicate support for switchless suspend/resume, which can + * save time and flicker, by using this routine and passing 'false' as + * the argument. If any loaded driver needs VT switching, or the + * no_console_suspend argument has been passed on the command line, VT + * switches will occur. + */ +void pm_vt_switch_required(struct device *dev, bool required) +{ + struct pm_vt_switch *entry, *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + /* already registered, update requirement */ + tmp->required = required; + goto out; + } + } + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + goto out; + + entry->required = required; + entry->dev = dev; + + list_add(&entry->head, &pm_vt_switch_list); +out: + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_required); + +/** + * pm_vt_switch_unregister - stop tracking a device's VT switching needs + * @dev: device + * + * Remove @dev from the vt switch list. 
+ */ +void pm_vt_switch_unregister(struct device *dev) +{ + struct pm_vt_switch *tmp; + + mutex_lock(&vt_switch_mutex); + list_for_each_entry(tmp, &pm_vt_switch_list, head) { + if (tmp->dev == dev) { + list_del(&tmp->head); + break; + } + } + mutex_unlock(&vt_switch_mutex); +} +EXPORT_SYMBOL(pm_vt_switch_unregister); + +/* + * There are three cases when a VT switch on suspend/resume is required: + * 1) no driver has indicated a requirement one way or another, so preserve + * the old behavior + * 2) console suspend is disabled, we want to see debug messages across + * suspend/resume + * 3) any registered driver indicates it needs a VT switch + * + * If none of these conditions is present, meaning we have at least one driver + * that doesn't need the switch, and none that do, we can avoid it to make + * resume look a little prettier (and suspend too, but that's usually hidden, + * e.g. when closing the lid on a laptop). + */ +static bool pm_vt_switch(void) +{ + struct pm_vt_switch *entry; + bool ret = true; + + mutex_lock(&vt_switch_mutex); + if (list_empty(&pm_vt_switch_list)) + goto out; + + if (!console_suspend_enabled) + goto out; + + list_for_each_entry(entry, &pm_vt_switch_list, head) { + if (entry->required) + goto out; + } + + ret = false; +out: + mutex_unlock(&vt_switch_mutex); + return ret; +} + int pm_prepare_console(void) { + if (!pm_vt_switch()) + return 0; + orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1); if (orig_fgconsole < 0) return 1; @@ -26,6 +139,9 @@ int pm_prepare_console(void) void pm_restore_console(void) { + if (!pm_vt_switch()) + return; + if (orig_fgconsole >= 0) { vt_move_to_console(orig_fgconsole, 0); vt_kmsg_redirect(orig_kmsg); -- cgit From 3cf2667b9f8b2c2fe298a427deb399e52321da6b Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Mon, 4 Feb 2013 13:37:21 +0000 Subject: fb: add support for drivers not needing VT switch at suspend/resume time Use the new PM routines to indicate whether we need to VT switch at suspend and resume time. When a new driver is bound, set its flag accordingly, and when unbound, remove it from the PM's console tracking list. Signed-off-by: Jesse Barnes Acked-by: Rafael J.
Wysocki Signed-off-by: Daniel Vetter --- drivers/video/fbmem.c | 7 +++++++ include/linux/fb.h | 2 ++ 2 files changed, 9 insertions(+) (limited to 'include/linux') diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index dc61c12ecf8c..2af7153da2e4 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -1645,6 +1645,11 @@ static int do_register_framebuffer(struct fb_info *fb_info) if (!fb_info->modelist.prev || !fb_info->modelist.next) INIT_LIST_HEAD(&fb_info->modelist); + if (fb_info->skip_vt_switch) + pm_vt_switch_required(fb_info->dev, false); + else + pm_vt_switch_required(fb_info->dev, true); + fb_var_to_videomode(&mode, &fb_info->var); fb_add_videomode(&mode, &fb_info->modelist); registered_fb[i] = fb_info; @@ -1679,6 +1684,8 @@ static int do_unregister_framebuffer(struct fb_info *fb_info) if (ret) return -EINVAL; + pm_vt_switch_unregister(fb_info->dev); + unlink_framebuffer(fb_info); if (fb_info->pixmap.addr && (fb_info->pixmap.flags & FB_PIXMAP_DEFAULT)) diff --git a/include/linux/fb.h b/include/linux/fb.h index 58b98606ac26..d49c60f5aa4c 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -501,6 +501,8 @@ struct fb_info { resource_size_t size; } ranges[0]; } *apertures; + + bool skip_vt_switch; /* no VT switch on suspend/resume required */ }; static inline struct apertures_struct *alloc_apertures(unsigned int max_num) { -- cgit From 4490108b4a5ada14c7be712260829faecc814ae5 Mon Sep 17 00:00:00 2001 From: Ben Pfaff Date: Fri, 15 Feb 2013 17:29:22 -0800 Subject: openvswitch: Allow OVS_USERSPACE_ATTR_USERDATA to be variable length. Until now, the optional OVS_USERSPACE_ATTR_USERDATA attribute had to be exactly 64 bits long, if it was present. However, 64 bits is not enough space to associate as much information with a flow as would be convenient for some userspace features now under development. This commit generalizes the attribute, allowing it to be any length. This generalization is backward-compatible: if userspace only uses 64-bit attributes, then it will not see any change in behavior. CC: Romain Lenglet Signed-off-by: Ben Pfaff Signed-off-by: Jesse Gross --- include/linux/openvswitch.h | 11 ++++++----- net/openvswitch/datapath.c | 11 ++++++----- net/openvswitch/datapath.h | 2 +- 3 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 99e6414a40d9..67d6c7b03581 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -127,7 +127,8 @@ enum ovs_packet_cmd { * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes. * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an - * %OVS_USERSPACE_ATTR_USERDATA attribute. + * %OVS_USERSPACE_ATTR_USERDATA attribute, with the same length and content + * specified there. * * These attributes follow the &struct ovs_header within the Generic Netlink * payload for %OVS_PACKET_* commands. @@ -137,7 +138,7 @@ enum ovs_packet_attr { OVS_PACKET_ATTR_PACKET, /* Packet data. */ OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */ OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ - OVS_PACKET_ATTR_USERDATA, /* u64 OVS_ACTION_ATTR_USERSPACE arg. */ + OVS_PACKET_ATTR_USERDATA, /* OVS_ACTION_ATTR_USERSPACE arg. */ __OVS_PACKET_ATTR_MAX }; @@ -389,13 +390,13 @@ enum ovs_sample_attr { * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action. 
* @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION * message should be sent. Required. - * @OVS_USERSPACE_ATTR_USERDATA: If present, its u64 argument is copied to the - * %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA, + * @OVS_USERSPACE_ATTR_USERDATA: If present, its variable-length argument is + * copied to the %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA. */ enum ovs_userspace_attr { OVS_USERSPACE_ATTR_UNSPEC, OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */ - OVS_USERSPACE_ATTR_USERDATA, /* u64 optional user-specified cookie. */ + OVS_USERSPACE_ATTR_USERDATA, /* Optional user-specified cookie. */ __OVS_USERSPACE_ATTR_MAX }; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index f9d2438e6437..96cd5b243d57 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -370,8 +370,8 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, len = sizeof(struct ovs_header); len += nla_total_size(skb->len); len += nla_total_size(FLOW_BUFSIZE); - if (upcall_info->cmd == OVS_PACKET_CMD_ACTION) - len += nla_total_size(8); + if (upcall_info->userdata) + len += NLA_ALIGN(upcall_info->userdata->nla_len); user_skb = genlmsg_new(len, GFP_ATOMIC); if (!user_skb) { @@ -388,8 +388,9 @@ static int queue_userspace_packet(struct net *net, int dp_ifindex, nla_nest_end(user_skb, nla); if (upcall_info->userdata) - nla_put_u64(user_skb, OVS_PACKET_ATTR_USERDATA, - nla_get_u64(upcall_info->userdata)); + __nla_put(user_skb, OVS_PACKET_ATTR_USERDATA, + nla_len(upcall_info->userdata), + nla_data(upcall_info->userdata)); nla = __nla_reserve(user_skb, OVS_PACKET_ATTR_PACKET, skb->len); @@ -544,7 +545,7 @@ static int validate_userspace(const struct nlattr *attr) { static const struct nla_policy userspace_policy[OVS_USERSPACE_ATTR_MAX + 1] = { [OVS_USERSPACE_ATTR_PID] = {.type = NLA_U32 }, - [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_U64 }, + [OVS_USERSPACE_ATTR_USERDATA] = {.type = NLA_UNSPEC }, }; struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 031dfbf37c93..9125ad5c5aeb 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -119,7 +119,7 @@ struct ovs_skb_cb { * struct dp_upcall - metadata to include with a packet to send to userspace * @cmd: One of %OVS_PACKET_CMD_*. * @key: Becomes %OVS_PACKET_ATTR_KEY. Must be nonnull. - * @userdata: If nonnull, its u64 value is extracted and passed to userspace as + * @userdata: If nonnull, its variable-length value is passed to userspace as * %OVS_PACKET_ATTR_USERDATA. * @pid: Netlink PID to which packet should be sent. If @pid is 0 then no * packet is sent and the packet is accounted in the datapath's @n_lost -- cgit From 27cef8b47cfb27fa2955a8577637794f1f275db2 Mon Sep 17 00:00:00 2001 From: Heiko Stübner Date: Sat, 23 Feb 2013 12:06:44 -0800 Subject: Input: auo-pixcir-ts - handle reset gpio directly Devicetree based platforms don't handle device callbacks very well and until now no board has come along that needs more extended hwinit than pulling the rst gpio high. Therefore pull the reset handling directly into the driver and remove the callbacks from the driver. If extended device setup is needed at some later point, power-sequences would probably be the solution of choice. 
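For illustration only (not part of this patch), a board file now passes just the two GPIO numbers in the platform data instead of init_hw/exit_hw callbacks; the GPIO numbers below are hypothetical, and the interrupt-mode constant is assumed to come from the driver header:

static const struct auo_pixcir_ts_platdata example_pdata = {
	.gpio_int    = 17,	/* hypothetical interrupt GPIO */
	.gpio_rst    = 18,	/* hypothetical reset GPIO, driven high by the driver at probe */
	.int_setting = AUO_PIXCIR_INT_COMP_COORD,	/* assumed constant from auo-pixcir-ts.h */
	.x_max       = 800,
	.y_max       = 480,
};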
Signed-off-by: Heiko Stuebner Signed-off-by: Dmitry Torokhov --- drivers/input/touchscreen/auo-pixcir-ts.c | 26 ++++++++++++++++++++------ include/linux/input/auo-pixcir-ts.h | 4 +--- 2 files changed, 21 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/input/touchscreen/auo-pixcir-ts.c b/drivers/input/touchscreen/auo-pixcir-ts.c index 813413eebab7..6317a9c7884c 100644 --- a/drivers/input/touchscreen/auo-pixcir-ts.c +++ b/drivers/input/touchscreen/auo-pixcir-ts.c @@ -511,8 +511,21 @@ static int auo_pixcir_probe(struct i2c_client *client, goto err_gpio_dir; } - if (pdata->init_hw) - pdata->init_hw(client); + ret = gpio_request(pdata->gpio_rst, "auo_pixcir_ts_rst"); + if (ret) { + dev_err(&client->dev, "request of gpio %d failed, %d\n", + pdata->gpio_rst, ret); + goto err_gpio_dir; + } + + ret = gpio_direction_output(pdata->gpio_rst, 1); + if (ret) { + dev_err(&client->dev, "setting direction of gpio %d failed %d\n", + pdata->gpio_rst, ret); + goto err_gpio_rst; + } + + msleep(200); ts->client = client; ts->touch_ind_mode = 0; @@ -597,8 +610,9 @@ err_input_register: err_fw_vers: input_free_device(input_dev); err_input_alloc: - if (pdata->exit_hw) - pdata->exit_hw(client); + gpio_set_value(pdata->gpio_rst, 0); +err_gpio_rst: + gpio_free(pdata->gpio_rst); err_gpio_dir: gpio_free(pdata->gpio_int); err_gpio_int: @@ -616,8 +630,8 @@ static int auo_pixcir_remove(struct i2c_client *client) input_unregister_device(ts->input); - if (pdata->exit_hw) - pdata->exit_hw(client); + gpio_set_value(pdata->gpio_rst, 0); + gpio_free(pdata->gpio_rst); gpio_free(pdata->gpio_int); diff --git a/include/linux/input/auo-pixcir-ts.h b/include/linux/input/auo-pixcir-ts.h index 75d4be717714..5049f21928e4 100644 --- a/include/linux/input/auo-pixcir-ts.h +++ b/include/linux/input/auo-pixcir-ts.h @@ -43,12 +43,10 @@ */ struct auo_pixcir_ts_platdata { int gpio_int; + int gpio_rst; int int_setting; - void (*init_hw)(struct i2c_client *); - void (*exit_hw)(struct i2c_client *); - unsigned int x_max; unsigned int y_max; }; -- cgit From e90a6df80dc45ab53d2f4f4db297434e48c0208e Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Mon, 25 Feb 2013 11:31:43 +0100 Subject: HID: Extend the interface with report requests Some drivers send reports directly to the underlying device, creating an unwanted dependency on the underlying transport layer. This patch adds hid_hw_request() to the interface, thereby removing usbhid from the lion's share of the drivers.
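As a sketch (not code from this patch), a HID driver could now issue a feature-report request through the transport-independent helper instead of calling usbhid_submit_report() directly; the report lookup shown here is illustrative:

static void example_get_feature(struct hid_device *hdev, unsigned int id)
{
	struct hid_report *report;

	report = hdev->report_enum[HID_FEATURE_REPORT].report_id_hash[id];
	if (!report)
		return;

	/* queue a GET_REPORT; the bound ll_driver performs the transfer */
	hid_hw_request(hdev, report, HID_REQ_GET_REPORT);
}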
Signed-off-by: Henrik Rydberg Signed-off-by: Benjamin Tissoires Reviewed-by: Mika Westerberg Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 13 +++++++++++++ include/linux/hid.h | 20 ++++++++++++++++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 8e0c4bf94ebc..366fd09d257d 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1243,6 +1243,18 @@ static int usbhid_power(struct hid_device *hid, int lvl) return r; } +static void usbhid_request(struct hid_device *hid, struct hid_report *rep, int reqtype) +{ + switch (reqtype) { + case HID_REQ_GET_REPORT: + usbhid_submit_report(hid, rep, USB_DIR_IN); + break; + case HID_REQ_SET_REPORT: + usbhid_submit_report(hid, rep, USB_DIR_OUT); + break; + } +} + static struct hid_ll_driver usb_hid_driver = { .parse = usbhid_parse, .start = usbhid_start, @@ -1251,6 +1263,7 @@ static struct hid_ll_driver usb_hid_driver = { .close = usbhid_close, .power = usbhid_power, .hidinput_input_event = usb_hidinput_input_event, + .request = usbhid_request, }; static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *id) diff --git a/include/linux/hid.h b/include/linux/hid.h index e14b465b1146..261c713d4842 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -662,6 +662,7 @@ struct hid_driver { * @hidinput_input_event: event input event (e.g. ff or leds) * @parse: this method is called only once to parse the device data, * shouldn't allocate anything to not leak memory + * @request: send report request to device (e.g. feature report) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); @@ -676,6 +677,10 @@ struct hid_ll_driver { unsigned int code, int value); int (*parse)(struct hid_device *hdev); + + void (*request)(struct hid_device *hdev, + struct hid_report *report, int reqtype); + }; #define PM_HINT_FULLON 1<<5 @@ -883,6 +888,21 @@ static inline int hid_hw_power(struct hid_device *hdev, int level) return hdev->ll_driver->power ? hdev->ll_driver->power(hdev, level) : 0; } + +/** + * hid_hw_request - send report request to device + * + * @hdev: hid device + * @report: report to send + * @reqtype: hid request type + */ +static inline void hid_hw_request(struct hid_device *hdev, + struct hid_report *report, int reqtype) +{ + if (hdev->ll_driver->request) + hdev->ll_driver->request(hdev, report, reqtype); +} + int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, int interrupt); -- cgit From 3373443befa73ee60e4275e7699b26058b01455a Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Mon, 25 Feb 2013 11:31:44 +0100 Subject: HID: Extend the interface with wait io request Some drivers need to wait for I/O from the underlying device, creating an unwanted dependency on the underlying transport layer. This patch adds wait() to the interface, thereby removing usbhid from the lion's share of the drivers.
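A sketch (not from this patch) of combining the two new helpers for a synchronous report update:

static void example_set_feature_sync(struct hid_device *hdev,
				     struct hid_report *report)
{
	hid_hw_request(hdev, report, HID_REQ_SET_REPORT);
	hid_hw_wait(hdev);	/* block until buffered transfers have completed */
}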
Signed-off-by: Henrik Rydberg Signed-off-by: Benjamin Tissoires Reviewed-by: Mika Westerberg Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 1 + include/linux/hid.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 366fd09d257d..99d95d3368b5 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1264,6 +1264,7 @@ static struct hid_ll_driver usb_hid_driver = { .power = usbhid_power, .hidinput_input_event = usb_hidinput_input_event, .request = usbhid_request, + .wait = usbhid_wait_io, }; static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *id) diff --git a/include/linux/hid.h b/include/linux/hid.h index 261c713d4842..7071eb3d36c7 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -663,6 +663,7 @@ struct hid_driver { * @parse: this method is called only once to parse the device data, * shouldn't allocate anything to not leak memory * @request: send report request to device (e.g. feature report) + * @wait: wait for buffered io to complete (send/recv reports) */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); @@ -681,6 +682,8 @@ struct hid_ll_driver { void (*request)(struct hid_device *hdev, struct hid_report *report, int reqtype); + int (*wait)(struct hid_device *hdev); + }; #define PM_HINT_FULLON 1<<5 @@ -903,6 +906,17 @@ static inline void hid_hw_request(struct hid_device *hdev, hdev->ll_driver->request(hdev, report, reqtype); } +/** + * hid_hw_wait - wait for buffered io to complete + * + * @hdev: hid device + */ +static inline void hid_hw_wait(struct hid_device *hdev) +{ + if (hdev->ll_driver->wait) + hdev->ll_driver->wait(hdev); +} + int hid_report_raw_event(struct hid_device *hid, int type, u8 *data, int size, int interrupt); -- cgit From d6b0c58048d2c8c6f4955c37f670125b2792cd14 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Sat, 23 Feb 2013 13:11:14 -0800 Subject: devres: allow adding custom actions to the stack Sometimes drivers need to execute one-off actions in their error handling or device teardown paths. An example would be toggling a GPIO line to reset the controlled device into a predefined state. To allow performing such actions when using managed resources, let's allow adding them to the stack/group of devres resources. Acked-by: Tejun Heo Acked-by: Greg Kroah-Hartman Signed-off-by: Dmitry Torokhov --- drivers/base/devres.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 4 +++ 2 files changed, 78 insertions(+) (limited to 'include/linux') diff --git a/drivers/base/devres.c b/drivers/base/devres.c index 8731979d668a..724957a13d48 100644 --- a/drivers/base/devres.c +++ b/drivers/base/devres.c @@ -670,6 +670,80 @@ int devres_release_group(struct device *dev, void *id) } EXPORT_SYMBOL_GPL(devres_release_group); +/* + * Custom devres actions allow inserting a simple function call + * into the teardown sequence.
+ */ + +struct action_devres { + void *data; + void (*action)(void *); +}; + +static int devm_action_match(struct device *dev, void *res, void *p) +{ + struct action_devres *devres = res; + struct action_devres *target = p; + + return devres->action == target->action && + devres->data == target->data; +} + +static void devm_action_release(struct device *dev, void *res) +{ + struct action_devres *devres = res; + + devres->action(devres->data); +} + +/** + * devm_add_action() - add a custom action to list of managed resources + * @dev: Device that owns the action + * @action: Function that should be called + * @data: Pointer to data passed to @action implementation + * + * This adds a custom action to the list of managed resources so that + * it gets executed as part of standard resource unwinding. + */ +int devm_add_action(struct device *dev, void (*action)(void *), void *data) +{ + struct action_devres *devres; + + devres = devres_alloc(devm_action_release, + sizeof(struct action_devres), GFP_KERNEL); + if (!devres) + return -ENOMEM; + + devres->data = data; + devres->action = action; + + devres_add(dev, devres); + return 0; +} +EXPORT_SYMBOL_GPL(devm_add_action); + +/** + * devm_remove_action() - removes previously added custom action + * @dev: Device that owns the action + * @action: Function implementing the action + * @data: Pointer to data passed to @action implementation + * + * Removes instance of @action previously added by devm_add_action(). + * Both action and data should match one of the existing entries. + */ +void devm_remove_action(struct device *dev, void (*action)(void *), void *data) +{ + struct action_devres devres = { + .data = data, + .action = action, + }; + + WARN_ON(devres_destroy(dev, devm_action_release, devm_action_match, + &devres)); + +} +EXPORT_SYMBOL_GPL(devm_remove_action); + /* * Managed kzalloc/kfree */ diff --git a/include/linux/device.h b/include/linux/device.h index 86ef6ab553b1..854b247bf5f9 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -567,6 +567,10 @@ extern void devm_kfree(struct device *dev, void *p); void __iomem *devm_request_and_ioremap(struct device *dev, struct resource *res); +/* allows adding/removing a custom action to the devres stack */ +int devm_add_action(struct device *dev, void (*action)(void *), void *data); +void devm_remove_action(struct device *dev, void (*action)(void *), void *data); + struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about -- cgit From c849a6143bec520aff2a6646518b0d041402428b Mon Sep 17 00:00:00 2001 From: Andrew de los Reyes Date: Mon, 18 Feb 2013 09:20:21 -0800 Subject: HID: Separate struct hid_device's driver_lock into two locks. This patch separates struct hid_device's driver_lock into two. The goal is to allow hid device drivers to receive input during their probe() or remove() function calls. This is necessary because some drivers need to communicate with the device to determine parameters needed during probe (e.g., size of a multi-touch surface), and if possible, may prefer to communicate with a device on host-initiated disconnect (e.g., to put it into a low-power state). Historically, three functions used driver_lock: - hid_device_probe: blocks to acquire lock - hid_device_remove: blocks to acquire lock - hid_input_report: if locked returns -EBUSY, else acquires lock This patch adds another lock (driver_input_lock) which is used to block input from occurring. The lock behavior is now: - hid_device_probe: blocks to acq.
driver_lock, then driver_input_lock - hid_device_remove: blocks to acq. driver_lock, then driver_input_lock - hid_input_report: if driver_input_lock locked returns -EBUSY, else acquires driver_input_lock This patch also adds two helper functions to be called during probe() or remove(): hid_device_io_start() and hid_device_io_stop(). These functions lock and unlock, respectively, driver_input_lock; they also make a note of whether they did so that hid-core knows if a driver has changed the lock state. This patch results in no behavior change for existing devices and drivers. However, during a probe() or remove() function call in a driver, that driver may now selectively call hid_device_io_start() to let input events come through, then optionally call hid_device_io_stop() to stop them. Signed-off-by: Andrew de los Reyes Signed-off-by: Jiri Kosina --- drivers/hid/hid-core.c | 24 +++++++++++++++++++++--- include/linux/hid.h | 46 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 66 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/hid/hid-core.c b/drivers/hid/hid-core.c index ff75cabf7393..680068c0c46a 100644 --- a/drivers/hid/hid-core.c +++ b/drivers/hid/hid-core.c @@ -1267,7 +1267,7 @@ int hid_input_report(struct hid_device *hid, int type, u8 *data, int size, int i if (!hid) return -ENODEV; - if (down_trylock(&hid->driver_lock)) + if (down_trylock(&hid->driver_input_lock)) return -EBUSY; if (!hid->driver) { @@ -1324,7 +1324,7 @@ nomem: ret = hid_report_raw_event(hid, type, data, size, interrupt); unlock: - up(&hid->driver_lock); + up(&hid->driver_input_lock); return ret; } EXPORT_SYMBOL_GPL(hid_input_report); @@ -1845,6 +1845,11 @@ static int hid_device_probe(struct device *dev) if (down_interruptible(&hdev->driver_lock)) return -EINTR; + if (down_interruptible(&hdev->driver_input_lock)) { + ret = -EINTR; + goto unlock_driver_lock; + } + hdev->io_started = false; if (!hdev->driver) { id = hid_match_device(hdev, hdrv); @@ -1867,6 +1872,9 @@ static int hid_device_probe(struct device *dev) } } unlock: + if (!hdev->io_started) + up(&hdev->driver_input_lock); +unlock_driver_lock: up(&hdev->driver_lock); return ret; } @@ -1875,9 +1883,15 @@ static int hid_device_remove(struct device *dev) { struct hid_device *hdev = container_of(dev, struct hid_device, dev); struct hid_driver *hdrv; + int ret = 0; if (down_interruptible(&hdev->driver_lock)) return -EINTR; + if (down_interruptible(&hdev->driver_input_lock)) { + ret = -EINTR; + goto unlock_driver_lock; + } + hdev->io_started = false; hdrv = hdev->driver; if (hdrv) { @@ -1889,8 +1903,11 @@ static int hid_device_remove(struct device *dev) hdev->driver = NULL; } + if (!hdev->io_started) + up(&hdev->driver_input_lock); +unlock_driver_lock: up(&hdev->driver_lock); - return 0; + return ret; } static ssize_t modalias_show(struct device *dev, struct device_attribute *a, @@ -2329,6 +2346,7 @@ struct hid_device *hid_allocate_device(void) init_waitqueue_head(&hdev->debug_wait); INIT_LIST_HEAD(&hdev->debug_list); sema_init(&hdev->driver_lock, 1); + sema_init(&hdev->driver_input_lock, 1); return hdev; } diff --git a/include/linux/hid.h b/include/linux/hid.h index e14b465b1146..895b85639dec 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -456,7 +456,8 @@ struct hid_device { /* device report descriptor */ unsigned country; /* HID country */ struct hid_report_enum report_enum[HID_REPORT_TYPES]; - struct semaphore driver_lock; /* protects the current driver */ + struct semaphore driver_lock; /* protects the 
current driver, except during input */ + struct semaphore driver_input_lock; /* protects the current driver */ struct device dev; /* device */ struct hid_driver *driver; struct hid_ll_driver *ll_driver; @@ -477,6 +478,7 @@ struct hid_device { /* device report descriptor */ unsigned int status; /* see STAT flags above */ unsigned claimed; /* Claimed by hidinput, hiddev? */ unsigned quirks; /* Various quirks the device can pull on us */ + bool io_started; /* Protected by driver_lock. If IO has started */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ @@ -599,6 +601,10 @@ struct hid_usage_id { * @resume: invoked on resume if device was not reset (NULL means nop) * @reset_resume: invoked on resume if device was reset (NULL means nop) * + * probe should return -errno on error, or 0 on success. During probe, + * input will not be passed to raw_event unless hid_device_io_start is + * called. + * * raw_event and event should return 0 on no action performed, 1 when no * further processing should be done and negative on error * @@ -737,6 +743,44 @@ const struct hid_device_id *hid_match_id(struct hid_device *hdev, const struct hid_device_id *id); s32 hid_snto32(__u32 value, unsigned n); +/** + * hid_device_io_start - enable HID input during probe, remove + * + * @hid - the device + * + * This should only be called during probe or remove and only be + * called by the thread calling probe or remove. It will allow + * incoming packets to be delivered to the driver. + */ +static inline void hid_device_io_start(struct hid_device *hid) { + if (hid->io_started) { + dev_warn(&hid->dev, "io already started"); + return; + } + hid->io_started = true; + up(&hid->driver_input_lock); +} + +/** + * hid_device_io_stop - disable HID input during probe, remove + * + * @hid - the device + * + * Should only be called after hid_device_io_start. It will prevent + * incoming packets from going to the driver for the duration of + * probe, remove. If called during probe, packets will still go to the + * driver after probe is complete. This function should only be called + * by the thread calling probe or remove. + */ +static inline void hid_device_io_stop(struct hid_device *hid) { + if (!hid->io_started) { + dev_warn(&hid->dev, "io already stopped"); + return; + } + hid->io_started = false; + down(&hid->driver_input_lock); +} + /** + hid_map_usage - map usage input bits * -- cgit From c8801a8e715d7793e1e7bcd2f6fe132234741753 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 19 Mar 2012 16:35:48 +0000 Subject: regulator: core: Mark all get and enable calls as __must_check It's generally important that devices have power when they expect it, so drivers really ought to be checking for errors on the power-up paths.
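With the annotation in place, callers that ignore the return value get a compiler warning. A typical consumer pattern that satisfies __must_check, shown here only as an illustration (the function and variable names are hypothetical):

static int example_power_up(struct device *dev, struct regulator *vcc)
{
	int ret;

	ret = regulator_enable(vcc);	/* ignoring this return value now warns */
	if (ret) {
		dev_err(dev, "failed to enable vcc: %d\n", ret);
		return ret;
	}

	return 0;
}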
Signed-off-by: Mark Brown --- include/linux/regulator/consumer.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index 7bc732ce6e50..145022a83085 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -141,18 +141,18 @@ void regulator_put(struct regulator *regulator); void devm_regulator_put(struct regulator *regulator); /* regulator output control and status */ -int regulator_enable(struct regulator *regulator); +int __must_check regulator_enable(struct regulator *regulator); int regulator_disable(struct regulator *regulator); int regulator_force_disable(struct regulator *regulator); int regulator_is_enabled(struct regulator *regulator); int regulator_disable_deferred(struct regulator *regulator, int ms); -int regulator_bulk_get(struct device *dev, int num_consumers, - struct regulator_bulk_data *consumers); -int devm_regulator_bulk_get(struct device *dev, int num_consumers, - struct regulator_bulk_data *consumers); -int regulator_bulk_enable(int num_consumers, - struct regulator_bulk_data *consumers); +int __must_check regulator_bulk_get(struct device *dev, int num_consumers, + struct regulator_bulk_data *consumers); +int __must_check devm_regulator_bulk_get(struct device *dev, int num_consumers, + struct regulator_bulk_data *consumers); +int __must_check regulator_bulk_enable(int num_consumers, + struct regulator_bulk_data *consumers); int regulator_bulk_disable(int num_consumers, struct regulator_bulk_data *consumers); int regulator_bulk_force_disable(int num_consumers, -- cgit From f19b00da8ed37db4e3891fe534fcf3a605a0e562 Mon Sep 17 00:00:00 2001 From: "Kim, Milo" Date: Mon, 18 Feb 2013 06:50:39 +0000 Subject: regulator: core: support shared enable GPIO concept A regulator can be enabled by an external GPIO pin; this is configurable in the regulator_config. Until now, a GPIO could be owned by only one regulator device, but on some boards multiple regulators are enabled by one shared GPIO pin. This patch removes that limitation, allowing regulators to share an enable GPIO. New list for enable GPIOs: 'regulator_ena_gpio_list' manages the enable GPIOs in use. New structure for supporting a shared enable GPIO: 'regulator_enable_gpio'. Its enable count is used for balancing GPIO control: the count is incremented when the GPIO is enabled and decremented when it is disabled. Reference count: 'request_count' is incremented/decremented on requesting/freeing the GPIO, making sure the GPIO is only freed when it has no users. How it works: if the GPIO is already in use, requesting it again is skipped; if the GPIO is new, it is requested and added to the enable GPIO list. This list is used for balancing the enable GPIO count and pin control. GPIO and invert handling moved: 'ena_gpio' and 'ena_gpio_invert' of the regulator_config are now handled in the new function regulator_ena_gpio_request(), which uses the regulator_enable_gpio structure rather than regulator_dev.
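For illustration (not part of this patch), two regulators on one board can now pass the same enable GPIO number in their configs; the GPIO number below is hypothetical, and the second registration only bumps request_count instead of requesting the pin again:

static struct regulator_config ldo1_config = {
	.ena_gpio	= 100,			/* hypothetical shared enable pin */
	.ena_gpio_flags	= GPIOF_OUT_INIT_HIGH,
};

static struct regulator_config ldo2_config = {
	.ena_gpio	= 100,			/* same pin: reuses the first request */
	.ena_gpio_flags	= GPIOF_OUT_INIT_HIGH,
};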
Signed-off-by: Milo(Woogyom) Kim Reviewed-by: Axel Lin Signed-off-by: Mark Brown --- drivers/regulator/core.c | 86 +++++++++++++++++++++++++++++++++++----- include/linux/regulator/driver.h | 2 + 2 files changed, 78 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index da9782bd27d0..71d6adc4eeab 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -51,6 +51,7 @@ static DEFINE_MUTEX(regulator_list_mutex); static LIST_HEAD(regulator_list); static LIST_HEAD(regulator_map_list); +static LIST_HEAD(regulator_ena_gpio_list); static bool has_full_constraints; static bool board_wants_dummy_regulator; @@ -68,6 +69,19 @@ struct regulator_map { struct regulator_dev *regulator; }; +/* + * struct regulator_enable_gpio + * + * Management for shared enable GPIO pin + */ +struct regulator_enable_gpio { + struct list_head list; + int gpio; + u32 enable_count; /* a number of enabled shared GPIO */ + u32 request_count; /* a number of requested shared GPIO */ + unsigned int ena_gpio_invert:1; +}; + /* * struct regulator * @@ -1456,6 +1470,65 @@ void devm_regulator_put(struct regulator *regulator) } EXPORT_SYMBOL_GPL(devm_regulator_put); +/* Manage enable GPIO list. Same GPIO pin can be shared among regulators */ +static int regulator_ena_gpio_request(struct regulator_dev *rdev, + const struct regulator_config *config) +{ + struct regulator_enable_gpio *pin; + int ret; + + list_for_each_entry(pin, ®ulator_ena_gpio_list, list) { + if (pin->gpio == config->ena_gpio) { + rdev_dbg(rdev, "GPIO %d is already used\n", + config->ena_gpio); + goto update_ena_gpio_to_rdev; + } + } + + ret = gpio_request_one(config->ena_gpio, + GPIOF_DIR_OUT | config->ena_gpio_flags, + rdev_get_name(rdev)); + if (ret) + return ret; + + pin = kzalloc(sizeof(struct regulator_enable_gpio), GFP_KERNEL); + if (pin == NULL) { + gpio_free(config->ena_gpio); + return -ENOMEM; + } + + pin->gpio = config->ena_gpio; + pin->ena_gpio_invert = config->ena_gpio_invert; + list_add(&pin->list, ®ulator_ena_gpio_list); + +update_ena_gpio_to_rdev: + pin->request_count++; + rdev->ena_pin = pin; + return 0; +} + +static void regulator_ena_gpio_free(struct regulator_dev *rdev) +{ + struct regulator_enable_gpio *pin, *n; + + if (!rdev->ena_pin) + return; + + /* Free the GPIO only in case of no use */ + list_for_each_entry_safe(pin, n, ®ulator_ena_gpio_list, list) { + if (pin->gpio == rdev->ena_pin->gpio) { + if (pin->request_count <= 1) { + pin->request_count = 0; + gpio_free(pin->gpio); + list_del(&pin->list); + kfree(pin); + } else { + pin->request_count--; + } + } + } +} + static int _regulator_do_enable(struct regulator_dev *rdev) { int ret, delay; @@ -3435,18 +3508,13 @@ regulator_register(const struct regulator_desc *regulator_desc, dev_set_drvdata(&rdev->dev, rdev); if (config->ena_gpio && gpio_is_valid(config->ena_gpio)) { - ret = gpio_request_one(config->ena_gpio, - GPIOF_DIR_OUT | config->ena_gpio_flags, - rdev_get_name(rdev)); + ret = regulator_ena_gpio_request(rdev, config); if (ret != 0) { rdev_err(rdev, "Failed to request enable GPIO%d: %d\n", config->ena_gpio, ret); goto wash; } - rdev->ena_gpio = config->ena_gpio; - rdev->ena_gpio_invert = config->ena_gpio_invert; - if (config->ena_gpio_flags & GPIOF_OUT_INIT_HIGH) rdev->ena_gpio_state = 1; @@ -3522,8 +3590,7 @@ unset_supplies: scrub: if (rdev->supply) _regulator_put(rdev->supply); - if (rdev->ena_gpio) - gpio_free(rdev->ena_gpio); + regulator_ena_gpio_free(rdev); kfree(rdev->constraints); wash: 
device_unregister(&rdev->dev); @@ -3558,8 +3625,7 @@ void regulator_unregister(struct regulator_dev *rdev) unset_regulator_supplies(rdev); list_del(&rdev->list); kfree(rdev->constraints); - if (rdev->ena_gpio) - gpio_free(rdev->ena_gpio); + regulator_ena_gpio_free(rdev); device_unregister(&rdev->dev); mutex_unlock(&regulator_list_mutex); } diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 23070fd83872..a467d11dd67d 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -22,6 +22,7 @@ struct regmap; struct regulator_dev; struct regulator_init_data; +struct regulator_enable_gpio; enum regulator_status { REGULATOR_STATUS_OFF, @@ -300,6 +301,7 @@ struct regulator_dev { struct dentry *debugfs; + struct regulator_enable_gpio *ena_pin; int ena_gpio; unsigned int ena_gpio_invert:1; unsigned int ena_gpio_state:1; }; -- cgit From 7b74d149247c8972da1cec3e4c70b67049aaeb69 Mon Sep 17 00:00:00 2001 From: "Kim, Milo" Date: Mon, 18 Feb 2013 06:50:55 +0000 Subject: regulator: core: use regulator_ena_pin member

The regulator_dev now carries a pointer to the regulator_enable_gpio structure: 'ena_gpio' and 'ena_gpio_invert' were moved into regulator_enable_gpio.

regulator_dev         --->   regulator_enable_gpio
  .ena_gpio                     .gpio
  .ena_gpio_invert              .ena_gpio_invert

The 'ena_pin' pointer is used to check for a valid enable GPIO pin.

Signed-off-by: Milo(Woogyom) Kim Reviewed-by: Axel Lin Signed-off-by: Mark Brown --- drivers/regulator/core.c | 6 +++--- include/linux/regulator/driver.h | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 57d434d3145a..6c8c82406cd9 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1945,7 +1945,7 @@ EXPORT_SYMBOL_GPL(regulator_disable_regmap); static int _regulator_is_enabled(struct regulator_dev *rdev) { /* A GPIO control always takes precedence */ - if (rdev->ena_gpio) + if (rdev->ena_pin) return rdev->ena_gpio_state; /* If we don't know then assume that the regulator is always on */ @@ -3344,7 +3344,7 @@ static int add_regulator_attributes(struct regulator_dev *rdev) if (status < 0) return status; } - if (rdev->ena_gpio || ops->is_enabled) { + if (rdev->ena_pin || ops->is_enabled) { status = device_create_file(dev, &dev_attr_state); if (status < 0) return status; @@ -3556,7 +3556,7 @@ regulator_register(const struct regulator_desc *regulator_desc, if (config->ena_gpio_flags & GPIOF_OUT_INIT_HIGH) rdev->ena_gpio_state = 1; - if (rdev->ena_gpio_invert) + if (config->ena_gpio_invert) rdev->ena_gpio_state = !rdev->ena_gpio_state; } diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index a467d11dd67d..7b7aeec04f86 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -302,8 +302,6 @@ struct regulator_dev { struct dentry *debugfs; struct regulator_enable_gpio *ena_pin; - int ena_gpio; - unsigned int ena_gpio_invert:1; unsigned int ena_gpio_state:1; }; -- cgit From 07fe6e00f6cca6fef85a14a1dc3ed4f2e35d3f0b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jan 2013 15:03:44 -0500 Subject: get rid of duplicate logics in __SC_....[1-6] definitions

All those guys have the same form - "take a list of type/name pairs, apply some macro to each of them". Abstract that part away, convert all __SC_FOO##x(__VA_ARGS__) to __MAP(x,__SC_FOO,__VA_ARGS__).
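To make the mechanics concrete, a small userspace sketch; the __MAP*, __SC_DECL and __SC_STR_ADECL definitions mirror the patch below, while demo_write and its argument list are invented for the example:

#include <stdio.h>

#define __MAP1(m,t,a) m(t,a)
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
#define __MAP(n,...) __MAP##n(__VA_ARGS__)

#define __SC_DECL(t, a) t a
#define __SC_STR_ADECL(t, a) #a

/* Expands to: static long demo_write(int fd, const char * buf, unsigned len) */
static long demo_write(__MAP(3, __SC_DECL, int, fd, const char *, buf, unsigned, len))
{
	return (long)len + (buf != NULL) + fd;
}

int main(void)
{
	/* The same pair list fed through a different macro; expands to { "fd", "buf", "len" } */
	const char *names[] = { __MAP(3, __SC_STR_ADECL, int, fd, const char *, buf, unsigned, len) };

	printf("%s %s %s -> %ld\n", names[0], names[1], names[2],
	       demo_write(0, "x", 1));
	return 0;
}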
Signed-off-by: Al Viro --- include/linux/compat.h | 18 ++++------- include/linux/syscalls.h | 82 +++++++++++++++++++----------------------------- 2 files changed, 38 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/include/linux/compat.h b/include/linux/compat.h index 76a87fb57ac2..8c1dfc8d830d 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -27,12 +27,6 @@ #define __SC_DELOUSE(t,v) ((t)(unsigned long)(v)) #endif -#define __SC_CCAST1(t1, a1) __SC_DELOUSE(t1,a1) -#define __SC_CCAST2(t2, a2, ...) __SC_DELOUSE(t2,a2), __SC_CCAST1(__VA_ARGS__) -#define __SC_CCAST3(t3, a3, ...) __SC_DELOUSE(t3,a3), __SC_CCAST2(__VA_ARGS__) -#define __SC_CCAST4(t4, a4, ...) __SC_DELOUSE(t4,a4), __SC_CCAST3(__VA_ARGS__) -#define __SC_CCAST5(t5, a5, ...) __SC_DELOUSE(t5,a5), __SC_CCAST4(__VA_ARGS__) -#define __SC_CCAST6(t6, a6, ...) __SC_DELOUSE(t6,a6), __SC_CCAST5(__VA_ARGS__) #define COMPAT_SYSCALL_DEFINE1(name, ...) \ COMPAT_SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define COMPAT_SYSCALL_DEFINE2(name, ...) \ @@ -49,19 +43,19 @@ #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long compat_sys##name(__SC_DECL##x(__VA_ARGS__)); \ - static inline long C_SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ - asmlinkage long compat_SyS##name(__SC_LONG##x(__VA_ARGS__)) \ + asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ + asmlinkage long compat_SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))\ { \ - return (long) C_SYSC##name(__SC_CCAST##x(__VA_ARGS__)); \ + return C_SYSC##name(__MAP(x,__SC_DELOUSE,__VA_ARGS__)); \ } \ SYSCALL_ALIAS(compat_sys##name, compat_SyS##name); \ - static inline long C_SYSC##name(__SC_DECL##x(__VA_ARGS__)) + static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long compat_sys##name(__SC_DECL##x(__VA_ARGS__)) + asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 313a8e0a6553..f9411f0c1c80 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -78,49 +78,31 @@ struct sigaltstack; #include #include -#define __SC_DECL1(t1, a1) t1 a1 -#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__) -#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__) -#define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__) -#define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__) -#define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__) - -#define __SC_LONG1(t1, a1) long a1 -#define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__) -#define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__) -#define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__) -#define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__) -#define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__) - -#define __SC_CAST1(t1, a1) (t1) a1 -#define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__) -#define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__) -#define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__) -#define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__) -#define __SC_CAST6(t6, a6, ...) 
(t6) a6, __SC_CAST5(__VA_ARGS__) - -#define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long)) -#define __SC_TEST1(t1, a1) __SC_TEST(t1) -#define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__) -#define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__) -#define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__) -#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__) -#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__) +/* + * __MAP - apply a macro to syscall arguments + * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to + * m(t1, a1), m(t2, a2), ..., m(tn, an) + * The first argument must be equal to the amount of type/name + * pairs given. Note that this list of pairs (i.e. the arguments + * of __MAP starting at the third one) is in the same format as + * for SYSCALL_DEFINE/COMPAT_SYSCALL_DEFINE + */ +#define __MAP1(m,t,a) m(t,a) +#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__) +#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__) +#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__) +#define __MAP5(m,t,a,...) m(t,a), __MAP4(m,__VA_ARGS__) +#define __MAP6(m,t,a,...) m(t,a), __MAP5(m,__VA_ARGS__) +#define __MAP(n,...) __MAP##n(__VA_ARGS__) + +#define __SC_DECL(t, a) t a +#define __SC_LONG(t, a) long a +#define __SC_CAST(t, a) (t) a +#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(sizeof(type) > sizeof(long)) #ifdef CONFIG_FTRACE_SYSCALLS -#define __SC_STR_ADECL1(t, a) #a -#define __SC_STR_ADECL2(t, a, ...) #a, __SC_STR_ADECL1(__VA_ARGS__) -#define __SC_STR_ADECL3(t, a, ...) #a, __SC_STR_ADECL2(__VA_ARGS__) -#define __SC_STR_ADECL4(t, a, ...) #a, __SC_STR_ADECL3(__VA_ARGS__) -#define __SC_STR_ADECL5(t, a, ...) #a, __SC_STR_ADECL4(__VA_ARGS__) -#define __SC_STR_ADECL6(t, a, ...) #a, __SC_STR_ADECL5(__VA_ARGS__) - -#define __SC_STR_TDECL1(t, a) #t -#define __SC_STR_TDECL2(t, a, ...) #t, __SC_STR_TDECL1(__VA_ARGS__) -#define __SC_STR_TDECL3(t, a, ...) #t, __SC_STR_TDECL2(__VA_ARGS__) -#define __SC_STR_TDECL4(t, a, ...) #t, __SC_STR_TDECL3(__VA_ARGS__) -#define __SC_STR_TDECL5(t, a, ...) #t, __SC_STR_TDECL4(__VA_ARGS__) -#define __SC_STR_TDECL6(t, a, ...) #t, __SC_STR_TDECL5(__VA_ARGS__) +#define __SC_STR_ADECL(t, a) #a +#define __SC_STR_TDECL(t, a) #t extern struct ftrace_event_class event_class_syscall_enter; extern struct ftrace_event_class event_class_syscall_exit; @@ -217,10 +199,10 @@ extern struct trace_event_functions exit_syscall_print_funcs; #ifdef CONFIG_FTRACE_SYSCALLS #define SYSCALL_DEFINEx(x, sname, ...) \ static const char *types_##sname[] = { \ - __SC_STR_TDECL##x(__VA_ARGS__) \ + __MAP(x,__SC_STR_TDECL,__VA_ARGS__) \ }; \ static const char *args_##sname[] = { \ - __SC_STR_ADECL##x(__VA_ARGS__) \ + __MAP(x,__SC_STR_ADECL,__VA_ARGS__) \ }; \ SYSCALL_METADATA(sname, x); \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) @@ -234,21 +216,21 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_DEFINE(name) static inline long SYSC_##name #define __SYSCALL_DEFINEx(x, name, ...) 
\ - asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)); \ - static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)); \ - asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__)) \ + asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ + asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ - __SC_TEST##x(__VA_ARGS__); \ - return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__)); \ + __MAP(x,__SC_TEST,__VA_ARGS__); \ + return SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ } \ SYSCALL_ALIAS(sys##name, SyS##name); \ - static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__)) + static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ #define SYSCALL_DEFINE(name) asmlinkage long sys_##name #define __SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__)) + asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ -- cgit From 4a0fd5bf0fd0795af8f1be3b261f5cf146a4cb9b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jan 2013 15:16:58 -0500 Subject: teach SYSCALL_DEFINE how to deal with long long/unsigned long long ... and convert a bunch of SYSCALL_DEFINE ones to SYSCALL_DEFINE, killing the boilerplate crap around them. Signed-off-by: Al Viro --- arch/s390/kernel/sys_s390.c | 14 ++------------ fs/dcookies.c | 9 +-------- fs/notify/fanotify/fanotify_user.c | 17 +++-------------- fs/open.c | 28 +++------------------------- fs/read_write.c | 24 ++++-------------------- fs/sync.c | 26 ++++---------------------- include/linux/syscalls.h | 5 +++-- mm/fadvise.c | 18 ++---------------- mm/readahead.c | 9 +-------- 9 files changed, 23 insertions(+), 127 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index d0964d22adb5..23eb222c1658 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -132,19 +132,9 @@ SYSCALL_DEFINE1(s390_fadvise64_64, struct fadvise64_64_args __user *, args) * to * %r2: fd, %r3: mode, %r4/%r5: offset, 96(%r15)-103(%r15): len */ -SYSCALL_DEFINE(s390_fallocate)(int fd, int mode, loff_t offset, - u32 len_high, u32 len_low) +SYSCALL_DEFINE5(s390_fallocate, int, fd, int, mode, loff_t, offset, + u32, len_high, u32, len_low) { return sys_fallocate(fd, mode, offset, ((u64)len_high << 32) | len_low); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_s390_fallocate(long fd, long mode, loff_t offset, - long len_high, long len_low) -{ - return SYSC_s390_fallocate((int) fd, (int) mode, offset, - (u32) len_high, (u32) len_low); -} -SYSCALL_ALIAS(sys_s390_fallocate, SyS_s390_fallocate); -#endif - #endif diff --git a/fs/dcookies.c b/fs/dcookies.c index 17c779967828..f08375b97ffb 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -145,7 +145,7 @@ out: /* And here is where the userspace process can look up the cookie value * to retrieve the path. 
*/ -SYSCALL_DEFINE(lookup_dcookie)(u64 cookie64, char __user * buf, size_t len) +SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) { unsigned long cookie = (unsigned long)cookie64; int err = -EINVAL; @@ -201,13 +201,6 @@ out: mutex_unlock(&dcookie_mutex); return err; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_lookup_dcookie(u64 cookie64, long buf, long len) -{ - return SYSC_lookup_dcookie(cookie64, (char __user *) buf, (size_t) len); -} -SYSCALL_ALIAS(sys_lookup_dcookie, SyS_lookup_dcookie); -#endif static int dcookie_init(void) { diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 5d8444268a16..d0be29fa94cf 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -755,9 +755,9 @@ out_destroy_group: return fd; } -SYSCALL_DEFINE(fanotify_mark)(int fanotify_fd, unsigned int flags, - __u64 mask, int dfd, - const char __user * pathname) +SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, + __u64, mask, int, dfd, + const char __user *, pathname) { struct inode *inode = NULL; struct vfsmount *mnt = NULL; @@ -857,17 +857,6 @@ fput_and_out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fanotify_mark(long fanotify_fd, long flags, __u64 mask, - long dfd, long pathname) -{ - return SYSC_fanotify_mark((int) fanotify_fd, (unsigned int) flags, - mask, (int) dfd, - (const char __user *) pathname); -} -SYSCALL_ALIAS(sys_fanotify_mark, SyS_fanotify_mark); -#endif - /* * fanotify_user_setup - Our initialization function. Note that we cannot return * error because we have compiled-in VFS hooks. So an (unlikely) failure here diff --git a/fs/open.c b/fs/open.c index 68354466879f..a53922450448 100644 --- a/fs/open.c +++ b/fs/open.c @@ -212,32 +212,18 @@ COMPAT_SYSCALL_DEFINE2(ftruncate, unsigned int, fd, compat_ulong_t, length) /* LFS versions of truncate are only needed on 32 bit machines */ #if BITS_PER_LONG == 32 -SYSCALL_DEFINE(truncate64)(const char __user * path, loff_t length) +SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) { return do_sys_truncate(path, length); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_truncate64(long path, loff_t length) -{ - return SYSC_truncate64((const char __user *) path, length); -} -SYSCALL_ALIAS(sys_truncate64, SyS_truncate64); -#endif -SYSCALL_DEFINE(ftruncate64)(unsigned int fd, loff_t length) +SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { long ret = do_sys_ftruncate(fd, length, 0); /* avoid REGPARM breakage on x86: */ asmlinkage_protect(2, ret, fd, length); return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_ftruncate64(long fd, loff_t length) -{ - return SYSC_ftruncate64((unsigned int) fd, length); -} -SYSCALL_ALIAS(sys_ftruncate64, SyS_ftruncate64); -#endif #endif /* BITS_PER_LONG == 32 */ @@ -299,7 +285,7 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return ret; } -SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) +SYSCALL_DEFINE4(fallocate, int, fd, int, mode, loff_t, offset, loff_t, len) { struct fd f = fdget(fd); int error = -EBADF; @@ -311,14 +297,6 @@ SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len) return error; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fallocate(long fd, long mode, loff_t offset, loff_t len) -{ - return SYSC_fallocate((int)fd, (int)mode, offset, len); -} -SYSCALL_ALIAS(sys_fallocate, SyS_fallocate); -#endif - /* * 
access() needs to use the real uid/gid, not the effective uid/gid. * We do this by temporarily clearing all FS-related capabilities and diff --git a/fs/read_write.c b/fs/read_write.c index a698eff457fb..dcfd58d95f44 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -487,8 +487,8 @@ SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, return ret; } -SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pread64, unsigned int, fd, char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -506,17 +506,9 @@ SYSCALL_DEFINE(pread64)(unsigned int fd, char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pread64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pread64((unsigned int) fd, (char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pread64, SyS_pread64); -#endif -SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, - size_t count, loff_t pos) +SYSCALL_DEFINE4(pwrite64, unsigned int, fd, const char __user *, buf, + size_t, count, loff_t, pos) { struct fd f; ssize_t ret = -EBADF; @@ -534,14 +526,6 @@ SYSCALL_DEFINE(pwrite64)(unsigned int fd, const char __user *buf, return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_pwrite64(long fd, long buf, long count, loff_t pos) -{ - return SYSC_pwrite64((unsigned int) fd, (const char __user *) buf, - (size_t) count, pos); -} -SYSCALL_ALIAS(sys_pwrite64, SyS_pwrite64); -#endif /* * Reduce an iovec's length in-place. Return the resulting number of segments diff --git a/fs/sync.c b/fs/sync.c index 2c5d6639a66a..905f3f6b3d85 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -283,8 +283,8 @@ EXPORT_SYMBOL(generic_write_sync); * already-instantiated disk blocks, there are no guarantees here that the data * will be available after a crash. */ -SYSCALL_DEFINE(sync_file_range)(int fd, loff_t offset, loff_t nbytes, - unsigned int flags) +SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes, + unsigned int, flags) { int ret; struct fd f; @@ -365,29 +365,11 @@ out_put: out: return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range(long fd, loff_t offset, loff_t nbytes, - long flags) -{ - return SYSC_sync_file_range((int) fd, offset, nbytes, - (unsigned int) flags); -} -SYSCALL_ALIAS(sys_sync_file_range, SyS_sync_file_range); -#endif /* It would be nice if people remember that not all the world's an i386 when they introduce new system calls */ -SYSCALL_DEFINE(sync_file_range2)(int fd, unsigned int flags, - loff_t offset, loff_t nbytes) +SYSCALL_DEFINE4(sync_file_range2, int, fd, unsigned int, flags, + loff_t, offset, loff_t, nbytes) { return sys_sync_file_range(fd, offset, nbytes, flags); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_sync_file_range2(long fd, long flags, - loff_t offset, loff_t nbytes) -{ - return SYSC_sync_file_range2((int) fd, (unsigned int) flags, - offset, nbytes); -} -SYSCALL_ALIAS(sys_sync_file_range2, SyS_sync_file_range2); -#endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f9411f0c1c80..3e07b92efbf6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -96,9 +96,10 @@ struct sigaltstack; #define __MAP(n,...) 
__MAP##n(__VA_ARGS__) #define __SC_DECL(t, a) t a -#define __SC_LONG(t, a) long a +#define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL)) +#define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a #define __SC_CAST(t, a) (t) a -#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(sizeof(type) > sizeof(long)) +#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long)) #ifdef CONFIG_FTRACE_SYSCALLS #define __SC_STR_ADECL(t, a) #a diff --git a/mm/fadvise.c b/mm/fadvise.c index 7e092689a12a..3bcfd81db45e 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -25,7 +25,7 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. */ -SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) +SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) { struct fd f = fdget(fd); struct address_space *mapping; @@ -145,26 +145,12 @@ out: fdput(f); return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) -{ - return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); -} -SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); -#endif #ifdef __ARCH_WANT_SYS_FADVISE64 -SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) +SYSCALL_DEFINE4(fadvise64, int, fd, loff_t, offset, size_t, len, int, advice) { return sys_fadvise64_64(fd, offset, len, advice); } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) -{ - return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); -} -SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); -#endif #endif diff --git a/mm/readahead.c b/mm/readahead.c index 7963f2391236..daed28dd5830 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -576,7 +576,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) +SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count) { ssize_t ret; struct fd f; @@ -595,10 +595,3 @@ SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) } return ret; } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS -asmlinkage long SyS_readahead(long fd, loff_t offset, long count) -{ - return SYSC_readahead((int) fd, offset, (size_t) count); -} -SYSCALL_ALIAS(sys_readahead, SyS_readahead); -#endif -- cgit From e1b5bb6d1236d4ad2084c53aa83dde7cdf6f8eea Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jan 2013 17:16:07 -0500 Subject: consolidate cond_syscall and SYSCALL_ALIAS declarations take them to asm/linkage.h, with default in linux/linkage.h Signed-off-by: Al Viro --- arch/alpha/include/asm/linkage.h | 4 +++- arch/alpha/include/asm/unistd.h | 12 ------------ arch/arm/include/asm/unistd.h | 8 -------- arch/avr32/include/asm/unistd.h | 8 -------- arch/blackfin/include/asm/unistd.h | 8 -------- arch/cris/include/asm/unistd.h | 8 -------- arch/frv/include/asm/unistd.h | 10 ---------- arch/h8300/include/asm/linkage.h | 2 -- arch/h8300/include/asm/unistd.h | 7 ------- arch/ia64/include/asm/linkage.h | 4 ++++ arch/ia64/include/asm/unistd.h | 10 ---------- arch/m32r/include/asm/unistd.h | 10 ---------- arch/m68k/include/asm/unistd.h | 8 -------- arch/microblaze/include/asm/unistd.h | 8 -------- arch/mips/include/asm/linkage.h | 3 +++ arch/mips/include/asm/unistd.h | 8 -------- arch/mn10300/include/asm/unistd.h | 10 ---------- 
arch/parisc/include/asm/unistd.h | 8 -------- arch/powerpc/include/asm/linkage.h | 13 +++++++++++++ arch/powerpc/include/asm/unistd.h | 6 ------ arch/powerpc/include/uapi/asm/linkage.h | 6 ------ arch/s390/include/asm/unistd.h | 8 -------- arch/sh/include/asm/unistd.h | 8 -------- arch/sparc/include/asm/unistd.h | 8 -------- arch/x86/include/asm/unistd.h | 8 -------- arch/xtensa/include/asm/unistd.h | 8 -------- include/asm-generic/unistd.h | 17 ----------------- include/linux/linkage.h | 21 +++++++++++++++++++++ include/linux/syscalls.h | 14 -------------- 29 files changed, 44 insertions(+), 209 deletions(-) create mode 100644 arch/powerpc/include/asm/linkage.h delete mode 100644 arch/powerpc/include/uapi/asm/linkage.h (limited to 'include/linux') diff --git a/arch/alpha/include/asm/linkage.h b/arch/alpha/include/asm/linkage.h index 291c2d01c44f..7cfd06e8c935 100644 --- a/arch/alpha/include/asm/linkage.h +++ b/arch/alpha/include/asm/linkage.h @@ -1,6 +1,8 @@ #ifndef __ASM_LINKAGE_H #define __ASM_LINKAGE_H -/* Nothing to see here... */ +#define cond_syscall(x) asm(".weak\t" #x "\n" #x " = sys_ni_syscall") +#define SYSCALL_ALIAS(alias, name) \ + asm ( #alias " = " #name "\n\t.globl " #alias) #endif diff --git a/arch/alpha/include/asm/unistd.h b/arch/alpha/include/asm/unistd.h index 6d6fe7ab5473..43baee17acdf 100644 --- a/arch/alpha/include/asm/unistd.h +++ b/arch/alpha/include/asm/unistd.h @@ -18,16 +18,4 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* "Conditional" syscalls. What we want is - - __attribute__((weak,alias("sys_ni_syscall"))) - - but that raises the problem of what type to give the symbol. If we use - a prototype, it'll conflict with the definition given in this file and - others. If we use __typeof, we discover that not all symbols actually - have declarations. If we use no prototype, then we get warnings from - -Wstrict-prototypes. Ho hum. 
*/ - -#define cond_syscall(x) asm(".weak\t" #x "\n" #x " = sys_ni_syscall") - #endif /* _ALPHA_UNISTD_H */ diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index e4ddfb39ca34..141baa3f9a72 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -43,14 +43,6 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - /* * Unimplemented (or alternatively implemented) syscalls */ diff --git a/arch/avr32/include/asm/unistd.h b/arch/avr32/include/asm/unistd.h index dc4d5a931112..c1eb080e45fe 100644 --- a/arch/avr32/include/asm/unistd.h +++ b/arch/avr32/include/asm/unistd.h @@ -41,12 +41,4 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall"); - #endif /* __ASM_AVR32_UNISTD_H */ diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index 04e83ea8d5cc..c35414bdf7bd 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -20,12 +20,4 @@ #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_VFORK -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t_" #x "\n\t.set\t_" #x ",_sys_ni_syscall"); - #endif /* __ASM_BFIN_UNISTD_H */ diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index be57a988bfb9..0ff3f6889842 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -34,12 +34,4 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #endif /* _ASM_CRIS_UNISTD_H_ */ diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 4cfcc7bba25a..70ec7293dce7 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -31,14 +31,4 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif - #endif /* _ASM_UNISTD_H_ */ diff --git a/arch/h8300/include/asm/linkage.h b/arch/h8300/include/asm/linkage.h index 6f4df7d46180..1d81604fb0ad 100644 --- a/arch/h8300/include/asm/linkage.h +++ b/arch/h8300/include/asm/linkage.h @@ -2,7 +2,5 @@ #define _H8300_LINKAGE_H #undef SYMBOL_NAME_LABEL -#undef SYMBOL_NAME #define SYMBOL_NAME_LABEL(_name_) _##_name_##: -#define SYMBOL_NAME(_name_) _##_name_ #endif diff --git a/arch/h8300/include/asm/unistd.h b/arch/h8300/include/asm/unistd.h index 6721856d841b..ab671ecf5196 100644 --- a/arch/h8300/include/asm/unistd.h +++ b/arch/h8300/include/asm/unistd.h @@ -33,11 +33,4 @@ 
#define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - */ -#define cond_syscall(name) \ - asm (".weak\t_" #name "\n" \ - ".set\t_" #name ",_sys_ni_syscall"); - #endif /* _ASM_H8300_UNISTD_H_ */ diff --git a/arch/ia64/include/asm/linkage.h b/arch/ia64/include/asm/linkage.h index ef22a45c1890..787575701f1c 100644 --- a/arch/ia64/include/asm/linkage.h +++ b/arch/ia64/include/asm/linkage.h @@ -11,4 +11,8 @@ #endif +#define cond_syscall(x) asm(".weak\t" #x "#\n" #x "#\t=\tsys_ni_syscall#") +#define SYSCALL_ALIAS(alias, name) \ + asm ( #alias "# = " #name "#\n\t.globl " #alias "#") + #endif diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 096373800f73..afd45e0d552e 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -46,15 +46,5 @@ asmlinkage unsigned long sys_mmap2( struct pt_regs; asmlinkage long sys_ia64_pipe(void); -/* - * "Conditional" syscalls - * - * Note, this macro can only be used in the file which defines sys_ni_syscall, i.e., in - * kernel/sys_ni.c. This version causes warnings because the declaration isn't a - * proper prototype, but we can't use __typeof__ either, because not all cond_syscall() - * declarations have prototypes at the moment. - */ -#define cond_syscall(x) asmlinkage long x (void) __attribute__((weak,alias("sys_ni_syscall"))) - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_IA64_UNISTD_H */ diff --git a/arch/m32r/include/asm/unistd.h b/arch/m32r/include/asm/unistd.h index 555629b05267..59db80193454 100644 --- a/arch/m32r/include/asm/unistd.h +++ b/arch/m32r/include/asm/unistd.h @@ -48,14 +48,4 @@ #define __IGNORE_getresgid #define __IGNORE_chown -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") -#endif - #endif /* _ASM_M32R_UNISTD_H */ diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 6cd92671ca5e..014f288fc813 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -32,12 +32,4 @@ #define __ARCH_WANT_SYS_FORK #define __ARCH_WANT_SYS_VFORK -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #endif /* _ASM_M68K_UNISTD_H_ */ diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index b3778391d9cc..6dece2d002dc 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -37,13 +37,5 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_FORK -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall"); - #endif /* __ASSEMBLY__ */ #endif /* _ASM_MICROBLAZE_UNISTD_H */ diff --git a/arch/mips/include/asm/linkage.h b/arch/mips/include/asm/linkage.h index e9a940d1b0c6..2767dda9e309 100644 --- a/arch/mips/include/asm/linkage.h +++ b/arch/mips/include/asm/linkage.h @@ -6,5 +6,8 @@ #endif #define __weak __attribute__((weak)) +#define cond_syscall(x) asm(".weak\t" #x "\n" #x "\t=\tsys_ni_syscall") +#define SYSCALL_ALIAS(alias, name) \ + asm ( 
#alias " = " #name "\n\t.globl " #alias) #endif diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 64f661e32879..63c9c886173a 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -63,12 +63,4 @@ #endif /* !__ASSEMBLY__ */ -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n" #x "\t=\tsys_ni_syscall") - #endif /* _ASM_UNISTD_H */ diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index 7f9d9adfa51e..9d4e2d1ef90e 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -45,14 +45,4 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall"); -#endif - #endif /* _ASM_UNISTD_H */ diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index ae9a46cbfd92..74d835820ee7 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -170,12 +170,4 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #undef STR -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #endif /* _ASM_PARISC_UNISTD_H_ */ diff --git a/arch/powerpc/include/asm/linkage.h b/arch/powerpc/include/asm/linkage.h new file mode 100644 index 000000000000..b36f650a13ff --- /dev/null +++ b/arch/powerpc/include/asm/linkage.h @@ -0,0 +1,13 @@ +#ifndef _ASM_POWERPC_LINKAGE_H +#define _ASM_POWERPC_LINKAGE_H + +#ifdef CONFIG_PPC64 +#define cond_syscall(x) \ + asm ("\t.weak " #x "\n\t.set " #x ", sys_ni_syscall\n" \ + "\t.weak ." #x "\n\t.set ." #x ", .sys_ni_syscall\n") +#define SYSCALL_ALIAS(alias, name) \ + asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ + "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) +#endif + +#endif /* _ASM_POWERPC_LINKAGE_H */ diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index f25b5c45c435..91586d979c99 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -56,11 +56,5 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - */ -#define cond_syscall(x) \ - asmlinkage long x (void) __attribute__((weak,alias("sys_ni_syscall"))) - #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_UNISTD_H_ */ diff --git a/arch/powerpc/include/uapi/asm/linkage.h b/arch/powerpc/include/uapi/asm/linkage.h deleted file mode 100644 index e1c4ac1cc4ba..000000000000 --- a/arch/powerpc/include/uapi/asm/linkage.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_POWERPC_LINKAGE_H -#define _ASM_POWERPC_LINKAGE_H - -/* Nothing to see here... 
*/ - -#endif /* _ASM_POWERPC_LINKAGE_H */ diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index a6667a952969..651886353551 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -54,12 +54,4 @@ #define __ARCH_WANT_SYS_VFORK #define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #endif /* _ASM_S390_UNISTD_H_ */ diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index 5e90fa2b7eed..e77816c4b9bc 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -30,12 +30,4 @@ # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #include diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 5356810bd7e7..dfa53fdd5cbc 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -45,12 +45,4 @@ #define __ARCH_WANT_COMPAT_SYS_SENDFILE #endif -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #endif /* _SPARC_UNISTD_H */ diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 3d5df1c4447f..c2a48139c340 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -50,12 +50,4 @@ # define __ARCH_WANT_SYS_VFORK # define __ARCH_WANT_SYS_CLONE -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -# define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall") - #endif /* _ASM_X86_UNISTD_H */ diff --git a/arch/xtensa/include/asm/unistd.h b/arch/xtensa/include/asm/unistd.h index c38834de9ac7..cb4c2ce8d447 100644 --- a/arch/xtensa/include/asm/unistd.h +++ b/arch/xtensa/include/asm/unistd.h @@ -4,14 +4,6 @@ #define __ARCH_WANT_SYS_CLONE #include -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall"); - #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_LLSEEK diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 4077b5d9ff81..0501fa3f783d 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -9,20 +9,3 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_LLSEEK #endif - -/* - * "Conditional" syscalls - * - * What we want is __attribute__((weak,alias("sys_ni_syscall"))), - * but it doesn't work on all toolchains, so we just do it by hand - */ -#ifndef cond_syscall -#ifdef CONFIG_SYMBOL_PREFIX -#define __SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX -#else -#define __SYMBOL_PREFIX -#endif -#define cond_syscall(x) asm(".weak\t" __SYMBOL_PREFIX #x "\n\t" \ - ".set\t" __SYMBOL_PREFIX #x "," \ - __SYMBOL_PREFIX "sys_ni_syscall") -#endif diff 
--git a/include/linux/linkage.h b/include/linux/linkage.h index 807f1e533226..829d66c67fc2 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -2,6 +2,7 @@ #define _LINUX_LINKAGE_H #include +#include #include #ifdef __cplusplus @@ -14,6 +15,26 @@ #define asmlinkage CPP_ASMLINKAGE #endif +#ifndef SYMBOL_NAME +#ifdef CONFIG_SYMBOL_PREFIX +#define SYMBOL_NAME(x) CONFIG_SYMBOL_PREFIX ## x +#else +#define SYMBOL_NAME(x) x +#endif +#endif +#define __SYMBOL_NAME(x) __stringify(SYMBOL_NAME(x)) + +#ifndef cond_syscall +#define cond_syscall(x) asm(".weak\t" __SYMBOL_NAME(x) \ + "\n\t.set\t" __SYMBOL_NAME(x) "," __SYMBOL_NAME(sys_ni_syscall)); +#endif + +#ifndef SYSCALL_ALIAS +#define SYSCALL_ALIAS(alias, name) \ + asm ("\t.globl " __SYMBOL_NAME(alias) \ + "\n\t.set\t" __SYMBOL_NAME(alias) "," __SYMBOL_NAME(name)) +#endif + #define __page_aligned_data __section(.data..page_aligned) __aligned(PAGE_SIZE) #define __page_aligned_bss __section(.bss..page_aligned) __aligned(PAGE_SIZE) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3e07b92efbf6..87584373305d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -183,20 +183,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) -#ifdef CONFIG_PPC64 -#define SYSCALL_ALIAS(alias, name) \ - asm ("\t.globl " #alias "\n\t.set " #alias ", " #name "\n" \ - "\t.globl ." #alias "\n\t.set ." #alias ", ." #name) -#else -#if defined(CONFIG_ALPHA) || defined(CONFIG_MIPS) -#define SYSCALL_ALIAS(alias, name) \ - asm ( #alias " = " #name "\n\t.globl " #alias) -#else -#define SYSCALL_ALIAS(alias, name) \ - asm ("\t.globl " #alias "\n\t.set " #alias ", " #name) -#endif -#endif - #ifdef CONFIG_FTRACE_SYSCALLS #define SYSCALL_DEFINEx(x, sname, ...) \ static const char *types_##sname[] = { \ -- cgit From 22d1a35da0e247a006c286842a1846acb4ffed4f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jan 2013 17:18:07 -0500 Subject: make HAVE_SYSCALL_WRAPPERS unconditional Signed-off-by: Al Viro --- arch/Kconfig | 3 --- arch/alpha/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - include/linux/compat.h | 9 --------- include/linux/syscalls.h | 10 ---------- ipc/sem.c | 2 -- 10 files changed, 30 deletions(-) (limited to 'include/linux') diff --git a/arch/Kconfig b/arch/Kconfig index 5a1779c93940..892d6176fcf3 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -157,9 +157,6 @@ config ARCH_USE_BUILTIN_BSWAP instructions should set this. And it shouldn't hurt to set it on architectures that don't have such instructions. 
-config HAVE_SYSCALL_WRAPPERS - bool - config KRETPROBES def_bool y depends on KPROBES && HAVE_KRETPROBES diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 5833aa441481..5469f7b444ab 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -4,7 +4,6 @@ config ALPHA select HAVE_AOUT select HAVE_IDE select HAVE_OPROFILE - select HAVE_SYSCALL_WRAPPERS select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_DMA_ATTRS diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index ae9c716c46bb..32eb3d67bbef 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1737,7 +1737,6 @@ config 32BIT config 64BIT bool "64-bit kernel" depends on CPU_SUPPORTS_64BIT_KERNEL && SYS_SUPPORTS_64BIT_KERNEL - select HAVE_SYSCALL_WRAPPERS help Select this option if you want to build a 64-bit kernel. diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b89d7eb730a2..f460e32fe2a0 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -113,7 +113,6 @@ config PPC select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK - select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_PERF_EVENTS diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4b505370a1d5..f6cc1528df89 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -131,7 +131,6 @@ config S390 select HAVE_PERF_EVENTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS - select HAVE_SYSCALL_WRAPPERS select HAVE_UID16 if 32BIT select HAVE_VIRT_CPU_ACCOUNTING select HAVE_VIRT_TO_BUS diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 289127d5241c..67388cdb18c1 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -62,7 +62,6 @@ config SPARC64 select HAVE_RCU_TABLE_FREE if SMP select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP - select HAVE_SYSCALL_WRAPPERS select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index ff496ab1e794..95bd2ef6c943 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -16,7 +16,6 @@ config TILE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW select HAVE_DEBUG_BUGVERBOSE - select HAVE_SYSCALL_WRAPPERS if TILEGX select HAVE_VIRT_TO_BUS select SYS_HYPERVISOR select ARCH_HAVE_NMI_SAFE_CMPXCHG diff --git a/include/linux/compat.h b/include/linux/compat.h index 8c1dfc8d830d..110132527e4c 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -40,8 +40,6 @@ #define COMPAT_SYSCALL_DEFINE6(name, ...) \ COMPAT_SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS - #define COMPAT_SYSCALL_DEFINEx(x, name, ...) \ asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));\ @@ -52,13 +50,6 @@ SYSCALL_ALIAS(compat_sys##name, compat_SyS##name); \ static inline long C_SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ - -#define COMPAT_SYSCALL_DEFINEx(x, name, ...) 
\ - asmlinkage long compat_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) - -#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ - #ifndef compat_user_stack_pointer #define compat_user_stack_pointer() current_user_stack_pointer() #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 87584373305d..3b6fc13cb46a 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -198,8 +198,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) #endif -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS - #define SYSCALL_DEFINE(name) static inline long SYSC_##name #define __SYSCALL_DEFINEx(x, name, ...) \ @@ -213,14 +211,6 @@ extern struct trace_event_functions exit_syscall_print_funcs; SYSCALL_ALIAS(sys##name, SyS##name); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */ - -#define SYSCALL_DEFINE(name) asmlinkage long sys_##name -#define __SYSCALL_DEFINEx(x, name, ...) \ - asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) - -#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */ - asmlinkage long sys_time(time_t __user *tloc); asmlinkage long sys_stime(time_t __user *tptr); asmlinkage long sys_gettimeofday(struct timeval __user *tv, diff --git a/ipc/sem.c b/ipc/sem.c index 58d31f1c1eb5..e7236df7a470 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1156,13 +1156,11 @@ SYSCALL_DEFINE(semctl)(int semid, int semnum, int cmd, union semun arg) return -EINVAL; } } -#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS asmlinkage long SyS_semctl(int semid, int semnum, int cmd, union semun arg) { return SYSC_semctl((int) semid, (int) semnum, (int) cmd, arg); } SYSCALL_ALIAS(sys_semctl, SyS_semctl); -#endif /* If the task doesn't already have a undo_list, then allocate one * here. We guarantee there is only one thread using this undo list, -- cgit From 2cf0966683430b6468f36ca20515a33ca7f2403c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jan 2013 15:25:54 -0500 Subject: make SYSCALL_DEFINE-generated wrappers do asmlinkage_protect ... and switch i386 to HAVE_SYSCALL_WRAPPERS, killing open-coded uses of asmlinkage_protect() in a bunch of syscalls. 
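For reference, a hand expansion (approximate, not literal preprocessor output) of what the reworked __SYSCALL_DEFINEx below generates for SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length):

asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
static inline long SYSC_ftruncate(unsigned int fd, unsigned long length);
asmlinkage long SyS_ftruncate(long fd, long length)
{
	long ret = SYSC_ftruncate((unsigned int) fd, (unsigned long) length);
	/* the BUILD_BUG_ON_ZERO() size checks from __SC_TEST are elided here */
	asmlinkage_protect(2, ret, fd, length);
	return ret;
}
SYSCALL_ALIAS(sys_ftruncate, SyS_ftruncate);
static inline long SYSC_ftruncate(unsigned int fd, unsigned long length)
{
	return do_sys_ftruncate(fd, length, 1);
}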
Signed-off-by: Al Viro --- arch/x86/include/asm/syscalls.h | 4 +-- arch/x86/kernel/tls.c | 14 ++++------- arch/x86/um/tls_32.c | 5 ++-- fs/aio.c | 2 -- fs/open.c | 24 +++--------------- include/linux/syscalls.h | 6 ++++- kernel/exit.c | 5 ---- kernel/fork.c | 5 +--- kernel/uid16.c | 55 +++++++++-------------------------------- 9 files changed, 31 insertions(+), 89 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 6cf0a9cc60cd..5f87b35fd2ef 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -27,8 +27,8 @@ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); long sys_rt_sigreturn(void); /* kernel/tls.c */ -asmlinkage int sys_set_thread_area(struct user_desc __user *); -asmlinkage int sys_get_thread_area(struct user_desc __user *); +asmlinkage long sys_set_thread_area(struct user_desc __user *); +asmlinkage long sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 9d9d2f9e77a5..f7fec09e3e3a 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -3,13 +3,13 @@ #include #include #include +#include #include #include #include #include #include -#include #include "tls.h" @@ -89,11 +89,9 @@ int do_set_thread_area(struct task_struct *p, int idx, return 0; } -asmlinkage int sys_set_thread_area(struct user_desc __user *u_info) +SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, u_info) { - int ret = do_set_thread_area(current, -1, u_info, 1); - asmlinkage_protect(1, ret, u_info); - return ret; + return do_set_thread_area(current, -1, u_info, 1); } @@ -139,11 +137,9 @@ int do_get_thread_area(struct task_struct *p, int idx, return 0; } -asmlinkage int sys_get_thread_area(struct user_desc __user *u_info) +SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, u_info) { - int ret = do_get_thread_area(current, -1, u_info); - asmlinkage_protect(1, ret, u_info); - return ret; + return do_get_thread_area(current, -1, u_info); } int regset_tls_active(struct task_struct *target, diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c index 5f5feff3d24c..80ffa5b9982d 100644 --- a/arch/x86/um/tls_32.c +++ b/arch/x86/um/tls_32.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -274,7 +275,7 @@ clear: goto out; } -int sys_set_thread_area(struct user_desc __user *user_desc) +SYSCALL_DEFINE1(set_thread_area, struct user_desc __user *, user_desc) { struct user_desc info; int idx, ret; @@ -322,7 +323,7 @@ int ptrace_set_thread_area(struct task_struct *child, int idx, return set_tls_entry(child, &info, idx, 0); } -int sys_get_thread_area(struct user_desc __user *user_desc) +SYSCALL_DEFINE1(get_thread_area, struct user_desc __user *, user_desc) { struct user_desc info; int idx, ret; diff --git a/fs/aio.c b/fs/aio.c index 3f941f2a3059..c3ebb98a527b 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1790,7 +1790,5 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, ret = read_events(ioctx, min_nr, nr, events, timeout); put_ioctx(ioctx); } - - asmlinkage_protect(5, ret, ctx_id, min_nr, nr, events, timeout); return ret; } diff --git a/fs/open.c b/fs/open.c index a53922450448..8c741002f947 100644 --- a/fs/open.c +++ b/fs/open.c @@ -197,10 +197,7 @@ out: SYSCALL_DEFINE2(ftruncate, unsigned int, fd, unsigned long, length) { - long ret = do_sys_ftruncate(fd, length, 1); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + 
return do_sys_ftruncate(fd, length, 1); } #ifdef CONFIG_COMPAT @@ -219,10 +216,7 @@ SYSCALL_DEFINE2(truncate64, const char __user *, path, loff_t, length) SYSCALL_DEFINE2(ftruncate64, unsigned int, fd, loff_t, length) { - long ret = do_sys_ftruncate(fd, length, 0); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, fd, length); - return ret; + return do_sys_ftruncate(fd, length, 0); } #endif /* BITS_PER_LONG == 32 */ @@ -961,29 +955,19 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(AT_FDCWD, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, flags, mode); - return ret; + return do_sys_open(AT_FDCWD, filename, flags, mode); } SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) { - long ret; - if (force_o_largefile()) flags |= O_LARGEFILE; - ret = do_sys_open(dfd, filename, flags, mode); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, dfd, filename, flags, mode); - return ret; + return do_sys_open(dfd, filename, flags, mode); } #ifndef __alpha__ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3b6fc13cb46a..9660a8bdcbbe 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -99,6 +99,7 @@ struct sigaltstack; #define __TYPE_IS_LL(t) (__same_type((t)0, 0LL) || __same_type((t)0, 0ULL)) #define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a #define __SC_CAST(t, a) (t) a +#define __SC_ARGS(t, a) a #define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long)) #ifdef CONFIG_FTRACE_SYSCALLS @@ -200,13 +201,16 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_DEFINE(name) static inline long SYSC_##name +#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) #define __SYSCALL_DEFINEx(x, name, ...) 
\ asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \ asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \ { \ + long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ __MAP(x,__SC_TEST,__VA_ARGS__); \ - return SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \ + __PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \ + return ret; \ } \ SYSCALL_ALIAS(sys##name, SyS##name); \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca9935..25d0108d7452 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1629,9 +1629,6 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *, } put_pid(pid); - - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(5, ret, which, upid, infop, options, ru); return ret; } @@ -1669,8 +1666,6 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr, ret = do_wait(&wo); put_pid(pid); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(4, ret, upid, stat_addr, options, ru); return ret; } diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c9056..e1f34abe5887 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1674,10 +1674,7 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); - asmlinkage_protect(5, ret, clone_flags, newsp, - parent_tidptr, child_tidptr, tls_val); - return ret; + return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); } #endif diff --git a/kernel/uid16.c b/kernel/uid16.c index d7948eb10225..f6c83d7ef000 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -18,67 +18,43 @@ SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; + return sys_chown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) { - long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, filename, user, group); - return ret; + return sys_lchown(filename, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) { - long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, fd, user, group); - return ret; + return sys_fchown(fd, low2highuid(user), low2highgid(group)); } SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) { - long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, rgid, egid); - return ret; + return sys_setregid(low2highgid(rgid), low2highgid(egid)); } SYSCALL_DEFINE1(setgid16, old_gid_t, gid) { - long ret = sys_setgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; + return sys_setgid(low2highgid(gid)); } SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) { - long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(2, ret, ruid, euid); - return ret; + return sys_setreuid(low2highuid(ruid), low2highuid(euid)); } 
SYSCALL_DEFINE1(setuid16, old_uid_t, uid) { - long ret = sys_setuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; + return sys_setuid(low2highuid(uid)); } SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) { - long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), + return sys_setresuid(low2highuid(ruid), low2highuid(euid), low2highuid(suid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, ruid, euid, suid); - return ret; } SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euidp, old_uid_t __user *, suidp) @@ -100,11 +76,8 @@ SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruidp, old_uid_t __user *, euid SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) { - long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), + return sys_setresgid(low2highgid(rgid), low2highgid(egid), low2highgid(sgid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(3, ret, rgid, egid, sgid); - return ret; } @@ -127,18 +100,12 @@ SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgidp, old_gid_t __user *, egid SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) { - long ret = sys_setfsuid(low2highuid(uid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, uid); - return ret; + return sys_setfsuid(low2highuid(uid)); } SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) { - long ret = sys_setfsgid(low2highgid(gid)); - /* avoid REGPARM breakage on x86: */ - asmlinkage_protect(1, ret, gid); - return ret; + return sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(old_gid_t __user *grouplist, -- cgit From 19f4fc3aee180000fe45952691bbe69dde1d9e95 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Feb 2013 02:17:03 -0500 Subject: convert sendfile{,64} to COMPAT_SYSCALL_DEFINE Signed-off-by: Al Viro --- arch/mips/kernel/linux32.c | 20 ----------------- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/parisc/kernel/sys_parisc32.c | 19 ---------------- arch/parisc/kernel/syscall_table.S | 4 ++-- arch/powerpc/include/asm/systbl.h | 4 ++-- arch/powerpc/kernel/sys_ppc32.c | 18 ---------------- arch/s390/kernel/compat_linux.c | 42 ------------------------------------ arch/s390/kernel/compat_linux.h | 4 ---- arch/s390/kernel/compat_wrapper.S | 14 ------------ arch/s390/kernel/syscalls.S | 4 ++-- arch/sparc/kernel/sys32.S | 1 - arch/sparc/kernel/systbls_64.S | 2 +- arch/x86/ia32/sys_ia32.c | 20 ----------------- arch/x86/include/asm/sys_ia32.h | 1 - arch/x86/syscalls/syscall_32.tbl | 2 +- fs/compat.c | 22 ------------------- fs/read_write.c | 44 ++++++++++++++++++++++++++++++++++++-- fs/read_write.h | 2 -- include/linux/compat.h | 2 ++ 20 files changed, 54 insertions(+), 175 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 8eeee1c860c0..b0cc2a7df59f 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -226,26 +226,6 @@ SYSCALL_DEFINE1(32_personality, unsigned long, personality) return ret; } -SYSCALL_DEFINE4(32_sendfile, long, out_fd, long, in_fd, - compat_off_t __user *, offset, s32, count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - off_t of; - - if (offset && get_user(of, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, offset ? 
(off_t __user *)&of : NULL, count); - set_fs(old_fs); - - if (offset && put_user(of, offset)) - return -EFAULT; - - return ret; -} - asmlinkage ssize_t sys32_readahead(int fd, u32 pad0, u64 a2, u64 a3, size_t count) { diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 693d60b0855f..9b4df498fc5b 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -143,7 +143,7 @@ EXPORT(sysn32_call_table) PTR compat_sys_setitimer PTR sys_alarm PTR sys_getpid - PTR sys_32_sendfile + PTR compat_sys_sendfile PTR sys_socket /* 6040 */ PTR sys_connect PTR sys_accept diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index af8887f779f1..c1a70e805751 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -399,7 +399,7 @@ sys_call_table: PTR sys_capget PTR sys_capset /* 4205 */ PTR compat_sys_sigaltstack - PTR sys_32_sendfile + PTR compat_sys_sendfile PTR sys_ni_syscall PTR sys_ni_syscall PTR sys_mips_mmap2 /* 4210 */ diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 051c8b90231f..035ab3f94814 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -60,25 +60,6 @@ asmlinkage long sys32_unimplemented(int r26, int r25, int r24, int r23, return -ENOSYS; } -/* Note: it is necessary to treat out_fd and in_fd as unsigned ints, with the - * corresponding cast to a signed int to insure that the proper conversion - * (sign extension) between the register representation of a signed int (msr in - * 32-bit mode) and the register representation of a signed int (msr in 64-bit - * mode) is performed. - */ -asmlinkage long sys32_sendfile(u32 out_fd, u32 in_fd, - compat_off_t __user *offset, compat_size_t count) -{ - return compat_sys_sendfile((int)out_fd, (int)in_fd, offset, count); -} - -asmlinkage long sys32_sendfile64(u32 out_fd, u32 in_fd, - compat_loff_t __user *offset, compat_size_t count) -{ - return sys_sendfile64((int)out_fd, (int)in_fd, - (loff_t __user *)offset, count); -} - asmlinkage long sys32_semctl(int semid, int semnum, int cmd, union semun arg) { union semun u; diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index f57dc137b8dd..f232672a9e20 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -198,7 +198,7 @@ ENTRY_SAME(madvise) ENTRY_SAME(clone_wrapper) /* 120 */ ENTRY_SAME(setdomainname) - ENTRY_DIFF(sendfile) + ENTRY_COMP(sendfile) /* struct sockaddr... 
*/ ENTRY_SAME(recvfrom) /* struct timex contains longs */ @@ -304,7 +304,7 @@ ENTRY_SAME(gettid) ENTRY_OURS(readahead) ENTRY_SAME(tkill) - ENTRY_DIFF(sendfile64) + ENTRY_COMP(sendfile64) ENTRY_COMP(futex) /* 210 */ ENTRY_COMP(sched_setaffinity) ENTRY_COMP(sched_getaffinity) diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 535b6d8a41cc..634db7d2dc92 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -190,7 +190,7 @@ SYSCALL_SPU(getcwd) SYSCALL_SPU(capget) SYSCALL_SPU(capset) COMPAT_SYS(sigaltstack) -SYSX_SPU(sys_sendfile,compat_sys_sendfile_wrapper,sys_sendfile) +COMPAT_SYS_SPU(sendfile) SYSCALL(ni_syscall) SYSCALL(ni_syscall) PPC_SYS(vfork) @@ -230,7 +230,7 @@ COMPAT_SYS_SPU(sched_setaffinity) COMPAT_SYS_SPU(sched_getaffinity) SYSCALL(ni_syscall) SYSCALL(ni_syscall) -SYSX(sys_ni_syscall,compat_sys_sendfile64_wrapper,sys_sendfile64) +SYS32ONLY(sendfile64) COMPAT_SYS_SPU(io_setup) SYSCALL_SPU(io_destroy) COMPAT_SYS_SPU(io_getevents) diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index d0bafc0cdf06..6e7c2509bd2d 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -128,24 +128,6 @@ long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t pt } #endif -/* Note: it is necessary to treat out_fd and in_fd as unsigned ints, - * with the corresponding cast to a signed int to insure that the - * proper conversion (sign extension) between the register representation of a signed int (msr in 32-bit mode) - * and the register representation of a signed int (msr in 64-bit mode) is performed. - */ -asmlinkage long compat_sys_sendfile_wrapper(u32 out_fd, u32 in_fd, - compat_off_t __user *offset, u32 count) -{ - return compat_sys_sendfile((int)out_fd, (int)in_fd, offset, count); -} - -asmlinkage long compat_sys_sendfile64_wrapper(u32 out_fd, u32 in_fd, - compat_loff_t __user *offset, u32 count) -{ - return sys_sendfile((int)out_fd, (int)in_fd, - (off_t __user *)offset, count); -} - unsigned long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 19f26de27fae..fbd29c70a297 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -373,48 +373,6 @@ asmlinkage compat_ssize_t sys32_readahead(int fd, u32 offhi, u32 offlo, s32 coun return sys_readahead(fd, ((loff_t)AA(offhi) << 32) | AA(offlo), count); } -asmlinkage long sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, size_t count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - off_t of; - - if (offset && get_user(of, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, - offset ? (off_t __force __user *) &of : NULL, count); - set_fs(old_fs); - - if (offset && put_user(of, offset)) - return -EFAULT; - - return ret; -} - -asmlinkage long sys32_sendfile64(int out_fd, int in_fd, - compat_loff_t __user *offset, s32 count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - loff_t lof; - - if (offset && get_user(lof, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile64(out_fd, in_fd, - offset ? 
(loff_t __force __user *) &lof : NULL, - count); - set_fs(old_fs); - - if (offset && put_user(lof, offset)) - return -EFAULT; - - return ret; -} - struct stat64_emu31 { unsigned long long st_dev; unsigned int __pad1; diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index 00d92a5a6f6c..bce0b7aec8f9 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -106,10 +106,6 @@ long sys32_pread64(unsigned int fd, char __user *ubuf, size_t count, long sys32_pwrite64(unsigned int fd, const char __user *ubuf, size_t count, u32 poshi, u32 poslo); compat_ssize_t sys32_readahead(int fd, u32 offhi, u32 offlo, s32 count); -long sys32_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, - size_t count); -long sys32_sendfile64(int out_fd, int in_fd, compat_loff_t __user *offset, - s32 count); long sys32_stat64(const char __user * filename, struct stat64_emu31 __user * statbuf); long sys32_lstat64(const char __user * filename, struct stat64_emu31 __user * statbuf); diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 626cc6f0f446..a1dda9c67efe 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -666,13 +666,6 @@ ENTRY(sys32_capset_wrapper) llgtr %r3,%r3 # const cap_user_data_t jg sys_capset # branch to system call -ENTRY(sys32_sendfile_wrapper) - lgfr %r2,%r2 # int - lgfr %r3,%r3 # int - llgtr %r4,%r4 # __kernel_off_emu31_t * - llgfr %r5,%r5 # size_t - jg sys32_sendfile # branch to system call - #sys32_vfork_wrapper # done in vfork_glue ENTRY(sys32_truncate64_wrapper) @@ -1348,13 +1341,6 @@ ENTRY(sys32_readahead_wrapper) lgfr %r5,%r5 # s32 jg sys32_readahead # branch to system call -ENTRY(sys32_sendfile64_wrapper) - lgfr %r2,%r2 # int - lgfr %r3,%r3 # int - llgtr %r4,%r4 # compat_loff_t * - lgfr %r5,%r5 # s32 - jg sys32_sendfile64 # branch to system call - ENTRY(sys_tkill_wrapper) lgfr %r2,%r2 # pid_t lgfr %r3,%r3 # int diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 2695bb89699e..5f3f7fbc5465 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -195,7 +195,7 @@ SYSCALL(sys_getcwd,sys_getcwd,sys32_getcwd_wrapper) SYSCALL(sys_capget,sys_capget,sys32_capget_wrapper) SYSCALL(sys_capset,sys_capset,sys32_capset_wrapper) /* 185 */ SYSCALL(sys_sigaltstack,sys_sigaltstack,compat_sys_sigaltstack) -SYSCALL(sys_sendfile,sys_sendfile64,sys32_sendfile_wrapper) +SYSCALL(sys_sendfile,sys_sendfile64,compat_sys_sendfile) NI_SYSCALL /* streams1 */ NI_SYSCALL /* streams2 */ SYSCALL(sys_vfork,sys_vfork,sys_vfork) /* 190 */ @@ -231,7 +231,7 @@ SYSCALL(sys_madvise,sys_madvise,sys32_madvise_wrapper) SYSCALL(sys_getdents64,sys_getdents64,sys32_getdents64_wrapper) /* 220 */ SYSCALL(sys_fcntl64,sys_ni_syscall,compat_sys_fcntl64_wrapper) SYSCALL(sys_readahead,sys_readahead,sys32_readahead_wrapper) -SYSCALL(sys_sendfile64,sys_ni_syscall,sys32_sendfile64_wrapper) +SYSCALL(sys_sendfile64,sys_ni_syscall,compat_sys_sendfile64) SYSCALL(sys_setxattr,sys_setxattr,sys32_setxattr_wrapper) SYSCALL(sys_lsetxattr,sys_lsetxattr,sys32_lsetxattr_wrapper) /* 225 */ SYSCALL(sys_fsetxattr,sys_fsetxattr,sys32_fsetxattr_wrapper) diff --git a/arch/sparc/kernel/sys32.S b/arch/sparc/kernel/sys32.S index 240a3cecc11e..6c65d69c6635 100644 --- a/arch/sparc/kernel/sys32.S +++ b/arch/sparc/kernel/sys32.S @@ -46,7 +46,6 @@ SIGN1(sys32_io_submit, compat_sys_io_submit, %o1) SIGN1(sys32_mq_open, compat_sys_mq_open, %o1) SIGN1(sys32_select, compat_sys_select, %o0) 
SIGN3(sys32_futex, compat_sys_futex, %o1, %o2, %o5) -SIGN2(sys32_sendfile, compat_sys_sendfile, %o0, %o1) SIGN1(sys32_recvfrom, compat_sys_recvfrom, %o0) SIGN1(sys32_recvmsg, compat_sys_recvmsg, %o0) SIGN1(sys32_sendmsg, compat_sys_sendmsg, %o0) diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 088134834dab..a1444d0d08ee 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -25,7 +25,7 @@ sys_call_table32: /*20*/ .word sys_getpid, sys_capget, sys_capset, sys_setuid16, sys_getuid16 /*25*/ .word sys32_vmsplice, compat_sys_ptrace, sys_alarm, compat_sys_sigaltstack, sys_pause /*30*/ .word compat_sys_utime, sys_lchown, sys_fchown, sys_access, sys_nice - .word sys_chown, sys_sync, sys_kill, compat_sys_newstat, sys32_sendfile + .word sys_chown, sys_sync, sys_kill, compat_sys_newstat, compat_sys_sendfile /*40*/ .word compat_sys_newlstat, sys_dup, sys_sparc_pipe, compat_sys_times, sys_getuid .word sys_umount, sys_setgid16, sys_getgid16, sys_signal, sys_geteuid16 /*50*/ .word sys_getegid16, sys_acct, sys_nis_syscall, sys_getgid, compat_sys_ioctl diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index ad7a20cbc699..ad6ca0472722 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -194,26 +194,6 @@ asmlinkage long sys32_pwrite(unsigned int fd, const char __user *ubuf, } -asmlinkage long sys32_sendfile(int out_fd, int in_fd, - compat_off_t __user *offset, s32 count) -{ - mm_segment_t old_fs = get_fs(); - int ret; - off_t of; - - if (offset && get_user(of, offset)) - return -EFAULT; - - set_fs(KERNEL_DS); - ret = sys_sendfile(out_fd, in_fd, offset ? (off_t __user *)&of : NULL, - count); - set_fs(old_fs); - - if (offset && put_user(of, offset)) - return -EFAULT; - return ret; -} - /* * Some system calls that need sign extended arguments. This could be * done by a generic wrapper. 
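[Aside on the mechanism, not part of the patch: the "generic wrapper" the removed comment wishes for is exactly what COMPAT_SYSCALL_DEFINEn supplies. Every argument reaches the wrapper as a full register-width value and is narrowed through its declared 32-bit compat type, which re-extends the sign before the syscall body runs; that is why the hand-written sys32_sendfile stubs can be deleted across architectures. A minimal userspace sketch of that narrowing follows. All names are hypothetical and two's-complement narrowing is assumed; this is not the kernel macro itself.]

/* Illustrative only: models how a generic compat wrapper narrows
 * register-width arguments through 32-bit compat types, so each
 * architecture no longer needs its own sign-extending stub. */
#include <stdint.h>
#include <stdio.h>

typedef int32_t compat_int_t;	/* stand-in for a kernel compat type */

/* the syscall body sees properly typed, sign-extended values */
static long body_sendfile(int out_fd, int in_fd)
{
	if (out_fd < 0 || in_fd < 0)
		return -9;	/* -EBADF */
	return 0;
}

/* the generated wrapper: arguments arrive as raw register words */
static long wrapper_sendfile(unsigned long r2, unsigned long r3)
{
	/* truncate to 32 bits, then let the int conversion re-extend
	 * the sign; this replaces per-arch stubs like s390's
	 * "lgfr %r2,%r2" or sparc's SIGN2() entries seen above */
	return body_sendfile((compat_int_t)r2, (compat_int_t)r3);
}

int main(void)
{
	/* all register bits set: the low 32 bits encode fd -1, the
	 * high bits are junk left behind by the 32-bit caller */
	unsigned long reg = ~0UL;

	printf("%ld\n", wrapper_sendfile(reg, 3));	/* prints -9 */
	return 0;
}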
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 8459efc39686..6d944e4bb524 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -41,7 +41,6 @@ asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); -asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); long sys32_kill(int, int); long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index e6d55f0064df..6a00b1257d68 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -193,7 +193,7 @@ 184 i386 capget sys_capget 185 i386 capset sys_capset 186 i386 sigaltstack sys_sigaltstack compat_sys_sigaltstack -187 i386 sendfile sys_sendfile sys32_sendfile +187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg 190 i386 vfork sys_vfork stub32_vfork diff --git a/fs/compat.c b/fs/compat.c index cc09312f9aed..2ae2a98891cd 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1718,25 +1718,3 @@ COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, return do_handle_open(mountdirfd, handle, flags); } #endif - -#ifdef __ARCH_WANT_COMPAT_SYS_SENDFILE -asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, - compat_off_t __user *offset, compat_size_t count) -{ - loff_t pos; - off_t off; - ssize_t ret; - - if (offset) { - if (unlikely(get_user(off, offset))) - return -EFAULT; - pos = off; - ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); - if (unlikely(put_user(pos, offset))) - return -EFAULT; - return ret; - } - - return do_sendfile(out_fd, in_fd, NULL, count, 0); -} -#endif /* __ARCH_WANT_COMPAT_SYS_SENDFILE */ diff --git a/fs/read_write.c b/fs/read_write.c index dcfd58d95f44..f738e4dccfab 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -853,8 +853,8 @@ SYSCALL_DEFINE5(pwritev, unsigned long, fd, const struct iovec __user *, vec, return ret; } -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max) +static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, + size_t count, loff_t max) { struct fd in, out; struct inode *in_inode, *out_inode; @@ -978,3 +978,43 @@ SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, loff_t __user *, offset, si return do_sendfile(out_fd, in_fd, NULL, count, 0); } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(sendfile, int, out_fd, int, in_fd, + compat_off_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + off_t off; + ssize_t ret; + + if (offset) { + if (unlikely(get_user(off, offset))) + return -EFAULT; + pos = off; + ret = do_sendfile(out_fd, in_fd, &pos, count, MAX_NON_LFS); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} + +COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, + compat_loff_t __user *, offset, compat_size_t, count) +{ + loff_t pos; + ssize_t ret; + + if (offset) { + if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t)))) + return -EFAULT; + ret = do_sendfile(out_fd, in_fd, &pos, count, 0); + if (unlikely(put_user(pos, offset))) + return -EFAULT; + return ret; + } + + return do_sendfile(out_fd, in_fd, NULL, count, 0); +} +#endif diff --git a/fs/read_write.h b/fs/read_write.h index d3e00ef67420..d07b954c6e0c 100644 --- a/fs/read_write.h +++ b/fs/read_write.h @@ -12,5 +12,3 
@@ ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, unsigned long nr_segs, size_t len, loff_t *ppos, iov_fn_t fn); ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov, unsigned long nr_segs, loff_t *ppos, io_fn_t fn); -ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, size_t count, - loff_t max); diff --git a/include/linux/compat.h b/include/linux/compat.h index 110132527e4c..ad299afcd488 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -670,6 +670,8 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid, asmlinkage long compat_sys_sendfile(int out_fd, int in_fd, compat_off_t __user *offset, compat_size_t count); +asmlinkage long compat_sys_sendfile64(int out_fd, int in_fd, + compat_loff_t __user *offset, compat_size_t count); asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, compat_stack_t __user *uoss_ptr); -- cgit From 35280bd4a3fa841897e2638437607fdec6c34f31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 24 Feb 2013 14:52:17 -0500 Subject: switch epoll_pwait to COMPAT_SYSCALL_DEFINE Signed-off-by: Al Viro --- arch/s390/kernel/compat_wrapper.S | 10 -------- arch/s390/kernel/syscalls.S | 2 +- fs/compat.c | 49 --------------------------------------- fs/eventpoll.c | 47 +++++++++++++++++++++++++++++++++++++ include/linux/compat.h | 5 ++-- 5 files changed, 50 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index a1dda9c67efe..52bea71d93e6 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -1270,16 +1270,6 @@ ENTRY(sys_getcpu_wrapper) llgtr %r4,%r4 # struct getcpu_cache * jg sys_getcpu -ENTRY(compat_sys_epoll_pwait_wrapper) - lgfr %r2,%r2 # int - llgtr %r3,%r3 # struct compat_epoll_event * - lgfr %r4,%r4 # int - lgfr %r5,%r5 # int - llgtr %r6,%r6 # compat_sigset_t * - llgf %r0,164(%r15) # compat_size_t - stg %r0,160(%r15) - jg compat_sys_epoll_pwait - ENTRY(compat_sys_utimes_wrapper) llgtr %r2,%r2 # char * llgtr %r3,%r3 # struct compat_timeval * diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 5f3f7fbc5465..63d6b4343193 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -320,7 +320,7 @@ SYSCALL(sys_tee,sys_tee,sys_tee_wrapper) SYSCALL(sys_vmsplice,sys_vmsplice,compat_sys_vmsplice_wrapper) NI_SYSCALL /* 310 sys_move_pages */ SYSCALL(sys_getcpu,sys_getcpu,sys_getcpu_wrapper) -SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait_wrapper) +SYSCALL(sys_epoll_pwait,sys_epoll_pwait,compat_sys_epoll_pwait) SYSCALL(sys_utimes,sys_utimes,compat_sys_utimes_wrapper) SYSCALL(sys_s390_fallocate,sys_fallocate,sys_fallocate_wrapper) SYSCALL(sys_utimensat,sys_utimensat,compat_sys_utimensat_wrapper) /* 315 */ diff --git a/fs/compat.c b/fs/compat.c index 2ae2a98891cd..45137a3832f3 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -1659,54 +1658,6 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, return ret; } -#ifdef CONFIG_EPOLL - -asmlinkage long compat_sys_epoll_pwait(int epfd, - struct compat_epoll_event __user *events, - int maxevents, int timeout, - const compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - long err; - compat_sigset_t csigmask; - sigset_t ksigmask, sigsaved; - - /* - * If the caller wants a certain signal mask to be set during the wait, - * we apply it here. 
- */ - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) - return -EFAULT; - sigset_from_compat(&ksigmask, &csigmask); - sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - err = sys_epoll_wait(epfd, events, maxevents, timeout); - - /* - * If we changed the signal mask, we need to restore the original one. - * In case we've got a signal while waiting, we do not restore the - * signal mask yet, and we allow do_signal() to deliver the signal on - * the way back to userspace, before the signal mask is restored. - */ - if (sigmask) { - if (err == -EINTR) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } else - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - } - - return err; -} - -#endif /* CONFIG_EPOLL */ - #ifdef CONFIG_FHANDLE /* * Exactly like fs/open.c:sys_open_by_handle_at(), except that it diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 9fec1836057a..495d15558f42 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -40,6 +40,7 @@ #include #include #include +#include <linux/compat.h> /* * LOCKING: @@ -1940,6 +1941,52 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events, return error; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd, + struct epoll_event __user *, events, + int, maxevents, int, timeout, + const compat_sigset_t __user *, sigmask, + compat_size_t, sigsetsize) +{ + long err; + compat_sigset_t csigmask; + sigset_t ksigmask, sigsaved; + + /* + * If the caller wants a certain signal mask to be set during the wait, + * we apply it here. + */ + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&csigmask, sigmask, sizeof(csigmask))) + return -EFAULT; + sigset_from_compat(&ksigmask, &csigmask); + sigdelsetmask(&ksigmask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + err = sys_epoll_wait(epfd, events, maxevents, timeout); + + /* + * If we changed the signal mask, we need to restore the original one. + * In case we've got a signal while waiting, we do not restore the + * signal mask yet, and we allow do_signal() to deliver the signal on + * the way back to userspace, before the signal mask is restored. + */ + if (sigmask) { + if (err == -EINTR) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } else + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + } + + return err; +} +#endif + static int __init eventpoll_init(void) { struct sysinfo si; diff --git a/include/linux/compat.h b/include/linux/compat.h index ad299afcd488..cdec8f2e9e21 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -432,10 +432,9 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, /* * epoll (fs/eventpoll.c) compat bits follow ...
*/ -struct epoll_event; -#define compat_epoll_event epoll_event +struct epoll_event; /* fortunately, this one is fixed-layout */ asmlinkage long compat_sys_epoll_pwait(int epfd, - struct compat_epoll_event __user *events, + struct epoll_event __user *events, int maxevents, int timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); -- cgit From d5dc77bfeeab0b03a32e3db5e31e2f64605634ab Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 25 Feb 2013 18:42:04 -0500 Subject: consolidate compat lookup_dcookie() Signed-off-by: Al Viro --- arch/arm64/kernel/sys32.S | 7 ------- arch/mips/kernel/linux32.c | 6 ------ arch/mips/kernel/scall64-o32.S | 2 +- arch/parisc/kernel/sys_parisc32.c | 7 ------- arch/parisc/kernel/syscall_table.S | 2 +- arch/powerpc/include/asm/systbl.h | 2 +- arch/powerpc/kernel/sys_ppc32.c | 7 ------- arch/s390/kernel/compat_wrapper.S | 7 ------- arch/s390/kernel/syscalls.S | 2 +- arch/sparc/kernel/sys_sparc32.c | 8 -------- arch/sparc/kernel/systbls_64.S | 2 +- arch/tile/kernel/compat.c | 5 ----- arch/x86/ia32/sys_ia32.c | 6 ------ arch/x86/include/asm/sys_ia32.h | 1 - arch/x86/syscalls/syscall_32.tbl | 2 +- fs/dcookies.c | 12 ++++++++++++ include/linux/compat.h | 1 + kernel/sys_ni.c | 1 + 18 files changed, 20 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/arch/arm64/kernel/sys32.S b/arch/arm64/kernel/sys32.S index 9416d045a687..db01aa978c41 100644 --- a/arch/arm64/kernel/sys32.S +++ b/arch/arm64/kernel/sys32.S @@ -84,13 +84,6 @@ compat_sys_readahead_wrapper: b sys_readahead ENDPROC(compat_sys_readahead_wrapper) -compat_sys_lookup_dcookie: - orr x0, x0, x1, lsl #32 - mov w1, w2 - mov w2, w3 - b sys_lookup_dcookie -ENDPROC(compat_sys_lookup_dcookie) - compat_sys_fadvise64_64_wrapper: mov w6, w1 orr x1, x2, x3, lsl #32 diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index b0cc2a7df59f..6852d4876f82 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -259,12 +259,6 @@ asmlinkage long sys32_fallocate(int fd, int mode, unsigned offset_a2, merge_64(len_a4, len_a5)); } -asmlinkage long sys32_lookup_dcookie(u32 a0, u32 a1, char __user *buf, - size_t len) -{ - return sys_lookup_dcookie(merge_64(a0, a1), buf, len); -} - SYSCALL_DEFINE6(32_fanotify_mark, int, fanotify_fd, unsigned int, flags, u64, a3, u64, a4, int, dfd, const char __user *, pathname) { diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index c1a70e805751..91c8c6ea7b09 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -439,7 +439,7 @@ sys_call_table: PTR compat_sys_io_submit PTR sys_io_cancel /* 4245 */ PTR sys_exit_group - PTR sys32_lookup_dcookie + PTR compat_sys_lookup_dcookie PTR sys_epoll_create PTR sys_epoll_ctl PTR sys_epoll_wait /* 4250 */ diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 035ab3f94814..46bdf6080fe4 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -75,13 +75,6 @@ asmlinkage long sys32_semctl(int semid, int semnum, int cmd, union semun arg) return sys_semctl (semid, semnum, cmd, arg); } -long sys32_lookup_dcookie(u32 cookie_high, u32 cookie_low, char __user *buf, - size_t len) -{ - return sys_lookup_dcookie((u64)cookie_high << 32 | cookie_low, - buf, len); -} - asmlinkage long compat_sys_fanotify_mark(int fan_fd, int flags, u32 mask_hi, u32 mask_lo, int fd, const char __user *pathname) diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 
f232672a9e20..30c9a3bba1cc 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -318,7 +318,7 @@ ENTRY_SAME(alloc_hugepages) /* 220 */ ENTRY_SAME(free_hugepages) ENTRY_SAME(exit_group) - ENTRY_DIFF(lookup_dcookie) + ENTRY_COMP(lookup_dcookie) ENTRY_SAME(epoll_create) ENTRY_SAME(epoll_ctl) /* 225 */ ENTRY_SAME(epoll_wait) diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 634db7d2dc92..afef04d6ee52 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -239,7 +239,7 @@ SYSCALL_SPU(io_cancel) SYSCALL(set_tid_address) SYSX_SPU(sys_fadvise64,ppc32_fadvise64,sys_fadvise64) SYSCALL(exit_group) -SYSX(sys_lookup_dcookie,ppc32_lookup_dcookie,sys_lookup_dcookie) +COMPAT_SYS(lookup_dcookie) SYSCALL_SPU(epoll_create) SYSCALL_SPU(epoll_ctl) SYSCALL_SPU(epoll_wait) diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 6e7c2509bd2d..e695230ca181 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -177,13 +177,6 @@ asmlinkage int compat_sys_ftruncate64(unsigned int fd, u32 reg4, unsigned long h return sys_ftruncate(fd, (high << 32) | low); } -long ppc32_lookup_dcookie(u32 cookie_high, u32 cookie_low, char __user *buf, - size_t len) -{ - return sys_lookup_dcookie((u64)cookie_high << 32 | cookie_low, - buf, len); -} - long ppc32_fadvise64(int fd, u32 unused, u32 offset_high, u32 offset_low, size_t len, int advice) { diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 68117a3dd252..6d4958ea390b 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -926,13 +926,6 @@ ENTRY(sys_epoll_wait_wrapper) lgfr %r5,%r5 # int jg sys_epoll_wait # branch to system call -ENTRY(sys32_lookup_dcookie_wrapper) - sllg %r2,%r2,32 # get high word of 64bit dcookie - or %r2,%r3 # get low word of 64bit dcookie - llgtr %r3,%r4 # char * - llgfr %r4,%r5 # size_t - jg sys_lookup_dcookie - ENTRY(sys32_fadvise64_wrapper) lgfr %r2,%r2 # int sllg %r3,%r3,32 # get high word of 64bit loff_t diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 102254a4397d..9154e17f25b9 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -118,7 +118,7 @@ SYSCALL(sys_newstat,sys_newstat,compat_sys_newstat_wrapper) SYSCALL(sys_newlstat,sys_newlstat,compat_sys_newlstat_wrapper) SYSCALL(sys_newfstat,sys_newfstat,compat_sys_newfstat_wrapper) NI_SYSCALL /* old uname syscall */ -SYSCALL(sys_lookup_dcookie,sys_lookup_dcookie,sys32_lookup_dcookie_wrapper) /* 110 */ +SYSCALL(sys_lookup_dcookie,sys_lookup_dcookie,compat_sys_lookup_dcookie) /* 110 */ SYSCALL(sys_vhangup,sys_vhangup,sys_vhangup) NI_SYSCALL /* old "idle" system call */ NI_SYSCALL /* vm86old for i386 */ diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index f38f2280fade..5d4ee8374c84 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -303,14 +303,6 @@ long compat_sys_fadvise64_64(int fd, advice); } -long sys32_lookup_dcookie(unsigned long cookie_high, - unsigned long cookie_low, - char __user *buf, size_t len) -{ - return sys_lookup_dcookie((cookie_high << 32) | cookie_low, - buf, len); -} - long compat_sync_file_range(int fd, unsigned long off_high, unsigned long off_low, unsigned long nb_high, unsigned long nb_low, int flags) { return sys_sync_file_range(fd, diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 
46d575b6f696..8fd932080215 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -59,7 +59,7 @@ sys_call_table32: /*190*/ .word sys_init_module, sys_sparc64_personality, sys_remap_file_pages, sys_epoll_create, sys_epoll_ctl .word sys_epoll_wait, sys_ioprio_set, sys_getppid, compat_sys_sparc_sigaction, sys_sgetmask /*200*/ .word sys_ssetmask, sys_sigsuspend, compat_sys_newlstat, sys_uselib, compat_sys_old_readdir - .word sys32_readahead, sys32_socketcall, sys_syslog, sys32_lookup_dcookie, sys32_fadvise64 + .word sys32_readahead, sys32_socketcall, sys_syslog, compat_sys_lookup_dcookie, sys32_fadvise64 /*210*/ .word sys32_fadvise64_64, sys_tgkill, sys_waitpid, sys_swapoff, compat_sys_sysinfo .word compat_sys_ipc, sys32_sigreturn, sys_clone, sys_ioprio_get, compat_sys_adjtimex /*220*/ .word compat_sys_sigprocmask, sys_ni_syscall, sys_delete_module, sys_ni_syscall, sys_getpgid diff --git a/arch/tile/kernel/compat.c b/arch/tile/kernel/compat.c index 7f72401b4f45..c262a02d8efa 100644 --- a/arch/tile/kernel/compat.c +++ b/arch/tile/kernel/compat.c @@ -54,11 +54,6 @@ long compat_sys_pwrite64(unsigned int fd, char __user *ubuf, size_t count, return sys_pwrite64(fd, ubuf, count, ((loff_t)high << 32) | low); } -long compat_sys_lookup_dcookie(u32 low, u32 high, char __user *buf, size_t len) -{ - return sys_lookup_dcookie(((loff_t)high << 32) | low, buf, len); -} - long compat_sys_sync_file_range2(int fd, unsigned int flags, u32 offset_lo, u32 offset_hi, u32 nbytes_lo, u32 nbytes_hi) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index ad6ca0472722..c0df976b0b71 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -226,12 +226,6 @@ long sys32_vm86_warning(void) return -ENOSYS; } -long sys32_lookup_dcookie(u32 addr_low, u32 addr_high, - char __user *buf, size_t len) -{ - return sys_lookup_dcookie(((u64)addr_high << 32) | addr_low, buf, len); -} - asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) { diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 6d944e4bb524..2b0e0c2d5379 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -45,7 +45,6 @@ asmlinkage long sys32_personality(unsigned long); long sys32_kill(int, int); long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int); long sys32_vm86_warning(void); -long sys32_lookup_dcookie(u32, u32, char __user *, size_t); asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t); asmlinkage long sys32_sync_file_range(int, unsigned, unsigned, diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 6a00b1257d68..0b55cd773e4c 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -259,7 +259,7 @@ 250 i386 fadvise64 sys_fadvise64 sys32_fadvise64 # 251 is available for reuse (was briefly sys_set_zone_reclaim) 252 i386 exit_group sys_exit_group -253 i386 lookup_dcookie sys_lookup_dcookie sys32_lookup_dcookie +253 i386 lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie 254 i386 epoll_create sys_epoll_create 255 i386 epoll_ctl sys_epoll_ctl 256 i386 epoll_wait sys_epoll_wait diff --git a/fs/dcookies.c b/fs/dcookies.c index f08375b97ffb..ab5954b50267 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -25,6 +25,7 @@ #include #include #include +#include #include /* The dcookies are allocated from a kmem_cache and @@ -202,6 +203,17 @@ out: return err; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE4(lookup_dcookie, 
u32, w0, u32, w1, char __user *, buf, size_t, len) +{ +#ifdef __BIG_ENDIAN + return sys_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); +#else + return sys_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); +#endif +} +#endif + static int dcookie_init(void) { struct list_head * d; diff --git a/include/linux/compat.h b/include/linux/compat.h index cdec8f2e9e21..482c9e65b5bf 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -429,6 +429,7 @@ extern long compat_arch_ptrace(struct task_struct *child, compat_long_t request, asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, compat_long_t addr, compat_long_t data); +asmlinkage long compat_sys_lookup_dcookie(u32, u32, char __user *, size_t); /* * epoll (fs/eventpoll.c) compat bits follow ... */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 395084d4ce16..b50e2a003c5a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -20,6 +20,7 @@ cond_syscall(sys_quotactl); cond_syscall(sys32_quotactl); cond_syscall(sys_acct); cond_syscall(sys_lookup_dcookie); +cond_syscall(compat_sys_lookup_dcookie); cond_syscall(sys_swapon); cond_syscall(sys_swapoff); cond_syscall(sys_kexec_load); -- cgit From 56e41d3c5aa84d679eebdb3cb8a70b03c5fbd6c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 21 Jan 2013 23:15:25 -0500 Subject: merge compat sys_ipc instances Signed-off-by: Al Viro --- arch/mips/kernel/linux32.c | 69 --------------------------------------- arch/mips/kernel/scall64-o32.S | 2 +- arch/powerpc/kernel/sys_ppc32.c | 67 ------------------------------------- arch/s390/kernel/compat_linux.c | 44 ++----------------------- arch/s390/kernel/compat_linux.h | 1 - arch/s390/kernel/compat_wrapper.S | 8 ----- arch/s390/kernel/syscalls.S | 2 +- arch/sparc/kernel/sys_sparc32.c | 65 ------------------------------------ arch/x86/ia32/Makefile | 3 -- arch/x86/ia32/ipc32.c | 54 ------------------------------ arch/x86/include/asm/sys_ia32.h | 3 -- arch/x86/syscalls/syscall_32.tbl | 2 +- include/linux/compat.h | 1 + ipc/compat.c | 44 +++++++++++++++++++++++++ kernel/sys_ni.c | 2 +- 15 files changed, 52 insertions(+), 315 deletions(-) delete mode 100644 arch/x86/ia32/ipc32.c (limited to 'include/linux') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 6852d4876f82..7c57b8d7b255 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -119,75 +119,6 @@ SYSCALL_DEFINE6(32_pwrite, unsigned int, fd, const char __user *, buf, return sys_pwrite64(fd, buf, count, merge_64(a4, a5)); } -#ifdef CONFIG_SYSVIPC - -SYSCALL_DEFINE6(32_ipc, u32, call, long, first, long, second, long, third, - unsigned long, ptr, unsigned long, fifth) -{ - int version, err; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - err = sys_semtimedop(first, compat_ptr(ptr), second, NULL); - break; - case SEMTIMEDOP: - err = compat_sys_semtimedop(first, compat_ptr(ptr), second, - compat_ptr(fifth)); - break; - case SEMGET: - err = sys_semget(first, second, third); - break; - case SEMCTL: - err = compat_sys_semctl(first, second, third, compat_ptr(ptr)); - break; - case MSGSND: - err = compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); - break; - case MSGRCV: - err = compat_sys_msgrcv(first, second, fifth, third, - version, compat_ptr(ptr)); - break; - case MSGGET: - err = sys_msgget((key_t) first, second); - break; - case MSGCTL: - err = compat_sys_msgctl(first, second, compat_ptr(ptr)); - break; - 
case SHMAT: - err = compat_sys_shmat(first, second, third, version, - compat_ptr(ptr)); - break; - case SHMDT: - err = sys_shmdt(compat_ptr(ptr)); - break; - case SHMGET: - err = sys_shmget(first, (unsigned)second, third); - break; - case SHMCTL: - err = compat_sys_shmctl(first, second, compat_ptr(ptr)); - break; - default: - err = -EINVAL; - break; - } - - return err; -} - -#else - -SYSCALL_DEFINE6(32_ipc, u32, call, int, first, int, second, int, third, - u32, ptr, u32, fifth) -{ - return -ENOSYS; -} - -#endif /* CONFIG_SYSVIPC */ - #ifdef CONFIG_MIPS32_N32 SYSCALL_DEFINE4(n32_semctl, int, semid, int, semnum, int, cmd, u32, arg) { diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 91c8c6ea7b09..103bfe570fe8 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -309,7 +309,7 @@ sys_call_table: PTR compat_sys_wait4 PTR sys_swapoff /* 4115 */ PTR compat_sys_sysinfo - PTR sys_32_ipc + PTR compat_sys_ipc PTR sys_fsync PTR sys32_sigreturn PTR __sys_clone /* 4120 */ diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index e695230ca181..d78ad7b6c464 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -61,73 +61,6 @@ asmlinkage long ppc32_select(u32 n, compat_ulong_t __user *inp, return compat_sys_select((int)n, inp, outp, exp, compat_ptr(tvp_x)); } -#ifdef CONFIG_SYSVIPC -long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr, - u32 fifth) -{ - int version; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - - case SEMTIMEDOP: - if (fifth) - /* sign extend semid */ - return compat_sys_semtimedop((int)first, - compat_ptr(ptr), second, - compat_ptr(fifth)); - /* else fall through for normal semop() */ - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - /* sign extend semid */ - return sys_semtimedop((int)first, compat_ptr(ptr), second, - NULL); - case SEMGET: - /* sign extend key, nsems */ - return sys_semget((int)first, (int)second, third); - case SEMCTL: - /* sign extend semid, semnum */ - return compat_sys_semctl((int)first, (int)second, third, - compat_ptr(ptr)); - - case MSGSND: - /* sign extend msqid */ - return compat_sys_msgsnd((int)first, (int)second, third, - compat_ptr(ptr)); - case MSGRCV: - /* sign extend msqid, msgtyp */ - return compat_sys_msgrcv((int)first, second, (int)fifth, - third, version, compat_ptr(ptr)); - case MSGGET: - /* sign extend key */ - return sys_msgget((int)first, second); - case MSGCTL: - /* sign extend msqid */ - return compat_sys_msgctl((int)first, second, compat_ptr(ptr)); - - case SHMAT: - /* sign extend shmid */ - return compat_sys_shmat((int)first, second, third, version, - compat_ptr(ptr)); - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - /* sign extend key_t */ - return sys_shmget((int)first, second, third); - case SHMCTL: - /* sign extend shmid */ - return compat_sys_shmctl((int)first, second, compat_ptr(ptr)); - - default: - return -ENOSYS; - } - - return -ENOSYS; -} -#endif - unsigned long compat_sys_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index fbd29c70a297..8b6e4f5288a2 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -288,51 +288,13 @@ asmlinkage long sys32_getegid16(void) return 
high2lowgid(from_kgid_munged(current_user_ns(), current_egid())); } -/* - * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation. - * - * This is really horribly ugly. - */ #ifdef CONFIG_SYSVIPC -asmlinkage long sys32_ipc(u32 call, int first, int second, int third, u32 ptr) +COMPAT_SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, + unsigned long, third, compat_uptr_t, ptr) { if (call >> 16) /* hack for backward compatibility */ return -EINVAL; - switch (call) { - case SEMTIMEDOP: - return compat_sys_semtimedop(first, compat_ptr(ptr), - second, compat_ptr(third)); - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - return sys_semtimedop(first, compat_ptr(ptr), - second, NULL); - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: - return compat_sys_semctl(first, second, third, - compat_ptr(ptr)); - case MSGSND: - return compat_sys_msgsnd(first, second, third, - compat_ptr(ptr)); - case MSGRCV: - return compat_sys_msgrcv(first, second, 0, third, - 0, compat_ptr(ptr)); - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return compat_sys_msgctl(first, second, compat_ptr(ptr)); - case SHMAT: - return compat_sys_shmat(first, second, third, - 0, compat_ptr(ptr)); - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - return sys_shmget(first, (unsigned)second, third); - case SHMCTL: - return compat_sys_shmctl(first, second, compat_ptr(ptr)); - } - - return -ENOSYS; + return compat_sys_ipc(call, first, second, third, ptr, third); } #endif diff --git a/arch/s390/kernel/compat_linux.h b/arch/s390/kernel/compat_linux.h index bce0b7aec8f9..976518c0592a 100644 --- a/arch/s390/kernel/compat_linux.h +++ b/arch/s390/kernel/compat_linux.h @@ -94,7 +94,6 @@ long sys32_getuid16(void); long sys32_geteuid16(void); long sys32_getgid16(void); long sys32_getegid16(void); -long sys32_ipc(u32 call, int first, int second, int third, u32 ptr); long sys32_truncate64(const char __user * path, unsigned long high, unsigned long low); long sys32_ftruncate64(unsigned int fd, unsigned long high, unsigned long low); diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 6d4958ea390b..17644c8e10e1 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -388,14 +388,6 @@ ENTRY(compat_sys_sysinfo_wrapper) llgtr %r2,%r2 # struct sysinfo_emu31 * jg compat_sys_sysinfo # branch to system call -ENTRY(sys32_ipc_wrapper) - llgfr %r2,%r2 # uint - lgfr %r3,%r3 # int - lgfr %r4,%r4 # int - lgfr %r5,%r5 # int - llgfr %r6,%r6 # u32 - jg sys32_ipc # branch to system call - ENTRY(sys32_fsync_wrapper) llgfr %r2,%r2 # unsigned int jg sys_fsync # branch to system call diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 9154e17f25b9..d2baabed7148 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -125,7 +125,7 @@ NI_SYSCALL /* vm86old for i386 */ SYSCALL(sys_wait4,sys_wait4,compat_sys_wait4) SYSCALL(sys_swapoff,sys_swapoff,sys32_swapoff_wrapper) /* 115 */ SYSCALL(sys_sysinfo,sys_sysinfo,compat_sys_sysinfo_wrapper) -SYSCALL(sys_s390_ipc,sys_s390_ipc,sys32_ipc_wrapper) +SYSCALL(sys_s390_ipc,sys_s390_ipc,compat_sys_s390_ipc) SYSCALL(sys_fsync,sys_fsync,sys32_fsync_wrapper) SYSCALL(sys_sigreturn,sys_sigreturn,sys32_sigreturn) SYSCALL(sys_clone,sys_clone,sys_clone_wrapper) /* 120 */ diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 5d4ee8374c84..d546188b13df 100644 --- 
a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -49,71 +49,6 @@ #include #include -#ifdef CONFIG_SYSVIPC -asmlinkage long compat_sys_ipc(u32 call, u32 first, u32 second, u32 third, compat_uptr_t ptr, u32 fifth) -{ - int version; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMTIMEDOP: - if (fifth) - /* sign extend semid */ - return compat_sys_semtimedop((int)first, - compat_ptr(ptr), second, - compat_ptr(fifth)); - /* else fall through for normal semop() */ - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - /* sign extend semid */ - return sys_semtimedop((int)first, compat_ptr(ptr), second, - NULL); - case SEMGET: - /* sign extend key, nsems */ - return sys_semget((int)first, (int)second, third); - case SEMCTL: - /* sign extend semid, semnum */ - return compat_sys_semctl((int)first, (int)second, third, - compat_ptr(ptr)); - - case MSGSND: - /* sign extend msqid */ - return compat_sys_msgsnd((int)first, (int)second, third, - compat_ptr(ptr)); - case MSGRCV: - /* sign extend msqid, msgtyp */ - return compat_sys_msgrcv((int)first, second, (int)fifth, - third, version, compat_ptr(ptr)); - case MSGGET: - /* sign extend key */ - return sys_msgget((int)first, second); - case MSGCTL: - /* sign extend msqid */ - return compat_sys_msgctl((int)first, second, compat_ptr(ptr)); - - case SHMAT: - /* sign extend shmid */ - return compat_sys_shmat((int)first, second, third, version, - compat_ptr(ptr)); - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - /* sign extend key_t */ - return sys_shmget((int)first, second, third); - case SHMCTL: - /* sign extend shmid */ - return compat_sys_shmctl((int)first, second, compat_ptr(ptr)); - - default: - return -ENOSYS; - } - - return -ENOSYS; -} -#endif - asmlinkage long sys32_truncate64(const char __user * path, unsigned long high, unsigned long low) { if ((int)high < 0) diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index 455646e0e532..e785b422b766 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -5,9 +5,6 @@ obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o -sysv-$(CONFIG_SYSVIPC) := ipc32.o -obj-$(CONFIG_IA32_EMULATION) += $(sysv-y) - obj-$(CONFIG_IA32_AOUT) += ia32_aout.o audit-class-$(CONFIG_AUDIT) := audit.o diff --git a/arch/x86/ia32/ipc32.c b/arch/x86/ia32/ipc32.c deleted file mode 100644 index 29cdcd02ead3..000000000000 --- a/arch/x86/ia32/ipc32.c +++ /dev/null @@ -1,54 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -asmlinkage long sys32_ipc(u32 call, int first, int second, int third, - compat_uptr_t ptr, u32 fifth) -{ - int version; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - return sys_semtimedop(first, compat_ptr(ptr), second, NULL); - case SEMTIMEDOP: - return compat_sys_semtimedop(first, compat_ptr(ptr), second, - compat_ptr(fifth)); - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: - return compat_sys_semctl(first, second, third, compat_ptr(ptr)); - - case MSGSND: - return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); - case MSGRCV: - return compat_sys_msgrcv(first, second, fifth, third, - version, compat_ptr(ptr)); - case MSGGET: - return sys_msgget((key_t) first, second); - case 
MSGCTL: - return compat_sys_msgctl(first, second, compat_ptr(ptr)); - - case SHMAT: - return compat_sys_shmat(first, second, third, version, - compat_ptr(ptr)); - case SHMDT: - return sys_shmdt(compat_ptr(ptr)); - case SHMGET: - return sys_shmget(first, (unsigned)second, third); - case SHMCTL: - return compat_sys_shmctl(first, second, compat_ptr(ptr)); - } - return -ENOSYS; -} diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 2b0e0c2d5379..df8ad3b3920a 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -57,9 +57,6 @@ asmlinkage long sys32_fallocate(int, int, unsigned, asmlinkage long sys32_sigreturn(void); asmlinkage long sys32_rt_sigreturn(void); -/* ia32/ipc32.c */ -asmlinkage long sys32_ipc(u32, int, int, int, compat_uptr_t, u32); - asmlinkage long sys32_fanotify_mark(int, unsigned int, u32, u32, int, const char __user *); diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index 0b55cd773e4c..0f6f5becab0d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -123,7 +123,7 @@ 114 i386 wait4 sys_wait4 compat_sys_wait4 115 i386 swapoff sys_swapoff 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo -117 i386 ipc sys_ipc sys32_ipc +117 i386 ipc sys_ipc compat_sys_ipc 118 i386 fsync sys_fsync 119 i386 sigreturn sys_sigreturn stub32_sigreturn 120 i386 clone sys_clone stub32_clone diff --git a/include/linux/compat.h b/include/linux/compat.h index 482c9e65b5bf..79a4781ac502 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -318,6 +318,7 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, int version, void __user *uptr); long compat_sys_shmat(int first, int second, compat_uptr_t third, int version, void __user *uptr); +asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32); #else long compat_sys_semctl(int semid, int semnum, int cmd, int arg); long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp, diff --git a/ipc/compat.c b/ipc/compat.c index 2547f29dcd1b..1da2e2eb9d70 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -368,6 +368,50 @@ long compat_sys_msgrcv(int first, int second, int msgtyp, int third, return do_msgrcv(first, uptr, second, msgtyp, third, compat_do_msg_fill); } + +COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, + u32, third, compat_uptr_t, ptr, u32, fifth) +{ + int version; + + version = call >> 16; /* hack for backward compatibility */ + call &= 0xffff; + + switch (call) { + case SEMOP: + /* struct sembuf is the same on 32 and 64bit :)) */ + return sys_semtimedop(first, compat_ptr(ptr), second, NULL); + case SEMTIMEDOP: + return compat_sys_semtimedop(first, compat_ptr(ptr), second, + compat_ptr(fifth)); + case SEMGET: + return sys_semget(first, second, third); + case SEMCTL: + return compat_sys_semctl(first, second, third, compat_ptr(ptr)); + + case MSGSND: + return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); + case MSGRCV: + return compat_sys_msgrcv(first, second, fifth, third, + version, compat_ptr(ptr)); + case MSGGET: + return sys_msgget(first, second); + case MSGCTL: + return compat_sys_msgctl(first, second, compat_ptr(ptr)); + + case SHMAT: + return compat_sys_shmat(first, second, third, version, + compat_ptr(ptr)); + case SHMDT: + return sys_shmdt(compat_ptr(ptr)); + case SHMGET: + return sys_shmget(first, (unsigned)second, third); + case SHMCTL: + return compat_sys_shmctl(first, second, compat_ptr(ptr)); + } + + return -ENOSYS; +} #else long 
compat_sys_semctl(int semid, int semnum, int cmd, int arg) { diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index b50e2a003c5a..bfd6787b355a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -156,7 +156,7 @@ cond_syscall(compat_sys_process_vm_writev); cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); cond_syscall(sys_pciconfig_iobase); -cond_syscall(sys32_ipc); +cond_syscall(compat_sys_s390_ipc); cond_syscall(ppc_rtas); cond_syscall(sys_spu_run); cond_syscall(sys_spu_create); -- cgit From 0e65a81b105a3f646793d46740ad90fa5c067986 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 3 Feb 2013 14:36:44 -0500 Subject: get rid of compat_sys_semctl() and friends in case of ARCH_WANT_OLD_COMPAT_IPC Signed-off-by: Al Viro --- arch/mips/kernel/linux32.c | 24 ------- arch/mips/kernel/scall64-n32.S | 6 +- include/linux/compat.h | 17 ++--- ipc/compat.c | 158 +++++++++++++++++------------------------ 4 files changed, 73 insertions(+), 132 deletions(-) (limited to 'include/linux') diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index 7c57b8d7b255..d1d576b765f5 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -119,30 +119,6 @@ SYSCALL_DEFINE6(32_pwrite, unsigned int, fd, const char __user *, buf, return sys_pwrite64(fd, buf, count, merge_64(a4, a5)); } -#ifdef CONFIG_MIPS32_N32 -SYSCALL_DEFINE4(n32_semctl, int, semid, int, semnum, int, cmd, u32, arg) -{ - /* compat_sys_semctl expects a pointer to union semun */ - u32 __user *uptr = compat_alloc_user_space(sizeof(u32)); - if (put_user(arg, uptr)) - return -EFAULT; - return compat_sys_semctl(semid, semnum, cmd, uptr); -} - -SYSCALL_DEFINE4(n32_msgsnd, int, msqid, u32, msgp, unsigned int, msgsz, - int, msgflg) -{ - return compat_sys_msgsnd(msqid, msgsz, msgflg, compat_ptr(msgp)); -} - -SYSCALL_DEFINE5(n32_msgrcv, int, msqid, u32, msgp, size_t, msgsz, - int, msgtyp, int, msgflg) -{ - return compat_sys_msgrcv(msqid, msgsz, msgtyp, msgflg, IPC_64, - compat_ptr(msgp)); -} -#endif - SYSCALL_DEFINE1(32_personality, unsigned long, personality) { unsigned int p = personality & 0xffffffff; diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 9b4df498fc5b..edcb6594e7b5 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -168,11 +168,11 @@ EXPORT(sysn32_call_table) PTR sys_newuname PTR sys_semget PTR sys_semop - PTR sys_n32_semctl + PTR compat_sys_semctl PTR sys_shmdt /* 6065 */ PTR sys_msgget - PTR sys_n32_msgsnd - PTR sys_n32_msgrcv + PTR compat_sys_msgsnd + PTR compat_sys_msgrcv PTR compat_sys_msgctl PTR compat_sys_fcntl /* 6070 */ PTR sys_flock diff --git a/include/linux/compat.h b/include/linux/compat.h index 79a4781ac502..2bfe67329dc4 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -311,22 +311,13 @@ asmlinkage long compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, compat_size_t __user *len_ptr); -#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC -long compat_sys_semctl(int first, int second, int third, void __user *uptr); -long compat_sys_msgsnd(int first, int second, int third, void __user *uptr); -long compat_sys_msgrcv(int first, int second, int msgtyp, int third, - int version, void __user *uptr); -long compat_sys_shmat(int first, int second, compat_uptr_t third, int version, - void __user *uptr); asmlinkage long compat_sys_ipc(u32, int, int, u32, compat_uptr_t, u32); -#else -long compat_sys_semctl(int semid, int semnum, int cmd, int arg); -long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user 
*msgp, +asmlinkage long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); +asmlinkage long compat_sys_semctl(int semid, int semnum, int cmd, int arg); +asmlinkage long compat_sys_msgsnd(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, int msgflg); -long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp, +asmlinkage long compat_sys_msgrcv(int msqid, compat_uptr_t msgp, compat_ssize_t msgsz, long msgtyp, int msgflg); -long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg); -#endif long compat_sys_msgctl(int first, int second, void __user *uptr); long compat_sys_shmctl(int first, int second, void __user *uptr); long compat_sys_semtimedop(int semid, struct sembuf __user *tsems, diff --git a/ipc/compat.c b/ipc/compat.c index 1da2e2eb9d70..6cb6a4df86e4 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -306,7 +306,7 @@ static long do_compat_semctl(int first, int second, int third, u32 pad) return err; } -long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) +static long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) { struct compat_msgbuf __user *msgp = dest; size_t msgsz; @@ -320,59 +320,16 @@ long compat_do_msg_fill(void __user *dest, struct msg_msg *msg, size_t bufsz) return msgsz; } -#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC -long compat_sys_semctl(int first, int second, int third, void __user *uptr) -{ - u32 pad; - - if (!uptr) - return -EINVAL; - if (get_user(pad, (u32 __user *) uptr)) - return -EFAULT; - return do_compat_semctl(first, second, third, pad); -} - -long compat_sys_msgsnd(int first, int second, int third, void __user *uptr) -{ - struct compat_msgbuf __user *up = uptr; - long type; - - if (first < 0) - return -EINVAL; - if (second < 0) - return -EINVAL; - - if (get_user(type, &up->mtype)) - return -EFAULT; - - return do_msgsnd(first, type, up->mtext, second, third); -} - -long compat_sys_msgrcv(int first, int second, int msgtyp, int third, - int version, void __user *uptr) -{ - if (first < 0) - return -EINVAL; - if (second < 0) - return -EINVAL; - - if (!version) { - struct compat_ipc_kludge ipck; - if (!uptr) - return -EINVAL; - if (copy_from_user (&ipck, uptr, sizeof(ipck))) - return -EFAULT; - uptr = compat_ptr(ipck.msgp); - msgtyp = ipck.msgtyp; - } - return do_msgrcv(first, uptr, second, msgtyp, third, - compat_do_msg_fill); -} +#ifndef COMPAT_SHMLBA +#define COMPAT_SHMLBA SHMLBA +#endif +#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, u32, third, compat_uptr_t, ptr, u32, fifth) { int version; + u32 pad; version = call >> 16; /* hack for backward compatibility */ call &= 0xffff; @@ -387,21 +344,59 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, case SEMGET: return sys_semget(first, second, third); case SEMCTL: - return compat_sys_semctl(first, second, third, compat_ptr(ptr)); + if (!ptr) + return -EINVAL; + if (get_user(pad, (u32 __user *) compat_ptr(ptr))) + return -EFAULT; + return do_compat_semctl(first, second, third, pad); + + case MSGSND: { + struct compat_msgbuf __user *up = compat_ptr(ptr); + compat_long_t type; + + if (first < 0 || second < 0) + return -EINVAL; - case MSGSND: - return compat_sys_msgsnd(first, second, third, compat_ptr(ptr)); - case MSGRCV: - return compat_sys_msgrcv(first, second, fifth, third, - version, compat_ptr(ptr)); + if (get_user(type, &up->mtype)) + return -EFAULT; + + return do_msgsnd(first, type, up->mtext, second, third); + } + case MSGRCV: { + void __user *uptr = 
compat_ptr(ptr); + + if (first < 0 || second < 0) + return -EINVAL; + + if (!version) { + struct compat_ipc_kludge ipck; + if (!uptr) + return -EINVAL; + if (copy_from_user (&ipck, uptr, sizeof(ipck))) + return -EFAULT; + uptr = compat_ptr(ipck.msgp); + fifth = ipck.msgtyp; + } + return do_msgrcv(first, uptr, second, fifth, third, + compat_do_msg_fill); + } case MSGGET: return sys_msgget(first, second); case MSGCTL: return compat_sys_msgctl(first, second, compat_ptr(ptr)); - case SHMAT: - return compat_sys_shmat(first, second, third, version, - compat_ptr(ptr)); + case SHMAT: { + int err; + unsigned long raddr; + + if (version == 1) + return -EINVAL; + err = do_shmat(first, compat_ptr(ptr), second, &raddr, + COMPAT_SHMLBA); + if (err < 0) + return err; + return put_user(raddr, (compat_ulong_t *)compat_ptr(third)); + } case SHMDT: return sys_shmdt(compat_ptr(ptr)); case SHMGET: @@ -412,29 +407,30 @@ COMPAT_SYSCALL_DEFINE6(ipc, u32, call, int, first, int, second, return -ENOSYS; } -#else -long compat_sys_semctl(int semid, int semnum, int cmd, int arg) +#endif + +COMPAT_SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, int, arg) { return do_compat_semctl(semid, semnum, cmd, arg); } -long compat_sys_msgsnd(int msqid, struct compat_msgbuf __user *msgp, - compat_ssize_t msgsz, int msgflg) +COMPAT_SYSCALL_DEFINE4(msgsnd, int, msqid, compat_uptr_t, msgp, + compat_ssize_t, msgsz, int, msgflg) { + struct compat_msgbuf __user *up = compat_ptr(msgp); compat_long_t mtype; - if (get_user(mtype, &msgp->mtype)) + if (get_user(mtype, &up->mtype)) return -EFAULT; - return do_msgsnd(msqid, mtype, msgp->mtext, (ssize_t)msgsz, msgflg); + return do_msgsnd(msqid, mtype, up->mtext, (ssize_t)msgsz, msgflg); } -long compat_sys_msgrcv(int msqid, struct compat_msgbuf __user *msgp, - compat_ssize_t msgsz, long msgtyp, int msgflg) +COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp, + compat_ssize_t, msgsz, long, msgtyp, int, msgflg) { - return do_msgrcv(msqid, msgp, (ssize_t)msgsz, msgtyp, msgflg, - compat_do_msg_fill); + return do_msgrcv(msqid, compat_ptr(msgp), (ssize_t)msgsz, msgtyp, + msgflg, compat_do_msg_fill); } -#endif static inline int get_compat_msqid64(struct msqid64_ds *m64, struct compat_msqid64_ds __user *up64) @@ -552,28 +548,7 @@ long compat_sys_msgctl(int first, int second, void __user *uptr) return err; } -#ifndef COMPAT_SHMLBA -#define COMPAT_SHMLBA SHMLBA -#endif - -#ifdef CONFIG_ARCH_WANT_OLD_COMPAT_IPC -long compat_sys_shmat(int first, int second, compat_uptr_t third, int version, - void __user *uptr) -{ - int err; - unsigned long raddr; - compat_ulong_t __user *uaddr; - - if (version == 1) - return -EINVAL; - err = do_shmat(first, uptr, second, &raddr, COMPAT_SHMLBA); - if (err < 0) - return err; - uaddr = compat_ptr(third); - return put_user(raddr, uaddr); -} -#else -long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg) +COMPAT_SYSCALL_DEFINE3(shmat, int, shmid, compat_uptr_t, shmaddr, int, shmflg) { unsigned long ret; long err; @@ -584,7 +559,6 @@ long compat_sys_shmat(int shmid, compat_uptr_t shmaddr, int shmflg) force_successful_syscall_return(); return (long)ret; } -#endif static inline int get_compat_shmid64_ds(struct shmid64_ds *s64, struct compat_shmid64_ds __user *up64) -- cgit From 4b377bab29e6a241db42f27541e7fb63713ee178 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Mar 2013 10:47:59 -0500 Subject: make do_mremap() static The extern in sys_sparc_64.c was a rudiment of time when do_mremap() used to exist in MMU case (it doesn't anymore). 
As for !MMU one, nothing uses it outside of mm/nommu.c... Signed-off-by: Al Viro --- arch/sparc/kernel/sys_sparc_64.c | 4 ---- include/linux/mm.h | 3 --- mm/nommu.c | 3 +-- 3 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 708bc29d36a8..42beb6fc4ad8 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -470,10 +470,6 @@ SYSCALL_DEFINE2(64_munmap, unsigned long, addr, size_t, len) return vm_munmap(addr, len); } - -extern unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); SYSCALL_DEFINE5(64_mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7acc9dc73c9f..f4c8aa990442 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1079,9 +1079,6 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, bool need_rmap_locks); -extern unsigned long do_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr); extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa); diff --git a/mm/nommu.c b/mm/nommu.c index e19328087534..66737e0584ae 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1770,7 +1770,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len) * * MREMAP_FIXED is not supported under NOMMU conditions */ -unsigned long do_mremap(unsigned long addr, +static unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { @@ -1805,7 +1805,6 @@ unsigned long do_mremap(unsigned long addr, vma->vm_end = vma->vm_start + new_len; return vma->vm_start; } -EXPORT_SYMBOL(do_mremap); SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long, new_len, unsigned long, flags, -- cgit From 45d9550a0e7e9230606ca3c4c6f4dc6297848b2f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 19 Feb 2013 12:17:01 -0800 Subject: workqueue: allow more off-queue flag space When a work item is off-queue, its work->data contains WORK_STRUCT_* and WORK_OFFQ_* flags. As WORK_OFFQ_* flags are used only while a work item is off-queue, it can occupy bits of work->data which aren't used while off-queue. WORK_OFFQ_* currently only use bits used by on-queue CWQ pointer. As color bits aren't used while off-queue, there's no reason to not use them. Lower WORK_OFFQ_FLAG_BASE from WORK_STRUCT_FLAG_BITS to WORK_STRUCT_COLOR_SHIFT thus giving 4 more bits to off-queue flag space which is also used to record worker_pool ID while off-queue. This doesn't introduce any visible behavior difference. tj: Rewrote the description. 
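To make the gained space concrete, here is a minimal stand-alone C sketch of the bit arithmetic, assuming the common !CONFIG_DEBUG_OBJECTS_WORK layout (an illustration, not the kernel header itself):

#include <stdio.h>

enum {
	WORK_STRUCT_COLOR_SHIFT	= 4,	/* flag bits sit below the color field */
	WORK_STRUCT_COLOR_BITS	= 4,
	WORK_STRUCT_FLAG_BITS	= WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS,

	OLD_OFFQ_FLAG_BASE	= WORK_STRUCT_FLAG_BITS,	/* before this patch */
	NEW_OFFQ_FLAG_BASE	= WORK_STRUCT_COLOR_SHIFT,	/* after: reuse the color bits */
};

int main(void)
{
	/* prints 4: the extra bits now available for OFFQ flags and pool IDs */
	printf("%d\n", OLD_OFFQ_FLAG_BASE - NEW_OFFQ_FLAG_BASE);
	return 0;
}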
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 8afab27cdbc2..5bd030f630a9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -68,7 +68,7 @@ enum { WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ - WORK_OFFQ_FLAG_BASE = WORK_STRUCT_FLAG_BITS, + WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), -- cgit From 65dff759d2948cf18e2029fc5c0c595b8b7da3a5 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 1 Mar 2013 15:01:56 +0800 Subject: cgroup: fix cgroup_path() vs rename() race rename() will change dentry->d_name. The result of this race can be worse than seeing a partially rewritten name: we might access a stale pointer, because rename() will re-allocate memory to hold a longer name. As accessing dentry->name must be protected by dentry->d_lock or parent inode's i_mutex, while on the other hand cgroup_path() can be called with some irq-safe spinlocks held, we can't generate cgroup path using dentry->d_name. Alternatively, we make a copy of dentry->d_name and save it in cgrp->name when a cgroup is created, and update cgrp->name at rename(). v5: use flexible array instead of zero-size array. v4: - allocate root_cgroup_name and all root_cgroup->name points to it. - add cgroup_name() wrapper. v3: use kfree_rcu() instead of synchronize_rcu() in user-visible path. v2: make cgrp->name RCU safe. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- block/blk-cgroup.h | 2 - include/linux/cgroup.h | 24 +++++++++++ kernel/cgroup.c | 106 +++++++++++++++++++++++++++++++++-------------- 3 files changed, 100 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index f2b292925ccd..4e595ee8c915 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -247,9 +247,7 @@ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) { int ret; - rcu_read_lock(); ret = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); - rcu_read_unlock(); if (ret) strncpy(buf, "", buflen); return ret; } diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 900af5964f55..75c6ec1ba1ba 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -150,6 +150,11 @@ enum { CGRP_CPUSET_CLONE_CHILDREN, }; +struct cgroup_name { + struct rcu_head rcu_head; + char name[]; +}; + struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ @@ -172,6 +177,19 @@ struct cgroup { struct cgroup *parent; /* my parent */ struct dentry *dentry; /* cgroup fs entry, RCU protected */ + /* + * This is a copy of dentry->d_name, and it's needed because + * we can't use dentry->d_name in cgroup_path(). + * + * You must acquire rcu_read_lock() to access cgrp->name, and + * the only place that can change it is rename(), which is + * protected by parent dir's i_mutex. + * + * Normally you should use cgroup_name() wrapper rather than + * access it directly. 
+ */ + struct cgroup_name __rcu *name; + /* Private pointers for each registered subsystem */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; @@ -404,6 +422,12 @@ struct cgroup_scanner { void *data; }; +/* Caller should hold rcu_read_lock() */ +static inline const char *cgroup_name(const struct cgroup *cgrp) +{ + return rcu_dereference(cgrp->name)->name; +} + int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..50682168abc2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -238,6 +238,8 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); /* dummytop is a shorthand for the dummy hierarchy's top cgroup */ #define dummytop (&rootnode.top_cgroup) +static struct cgroup_name root_cgroup_name = { .name = "/" }; + /* This flag indicates whether tasks in the fork and exit paths should * check for fork/exit handlers to call. This avoids us having to do * extra work in the fork/exit path if none of the subsystems need to @@ -859,6 +861,17 @@ static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) return inode; } +static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) +{ + struct cgroup_name *name; + + name = kmalloc(sizeof(*name) + dentry->d_name.len + 1, GFP_KERNEL); + if (!name) + return NULL; + strcpy(name->name, dentry->d_name.name); + return name; +} + static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, free_work); @@ -889,6 +902,7 @@ static void cgroup_free_fn(struct work_struct *work) simple_xattrs_free(&cgrp->xattrs); ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); + kfree(rcu_dereference_raw(cgrp->name)); kfree(cgrp); } @@ -1421,6 +1435,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) INIT_LIST_HEAD(&root->allcg_list); root->number_of_cgroups = 1; cgrp->root = root; + cgrp->name = &root_cgroup_name; cgrp->top_cgroup = cgrp; init_cgroup_housekeeping(cgrp); list_add_tail(&cgrp->allcg_node, &root->allcg_list); @@ -1769,49 +1784,45 @@ static struct kobject *cgroup_kobj; * @buf: the buffer to write the path into * @buflen: the length of the buffer * - * Called with cgroup_mutex held or else with an RCU-protected cgroup - * reference. Writes path of cgroup into buf. Returns 0 on success, - * -errno on error. + * Writes path of cgroup into buf. Returns 0 on success, -errno on error. + * + * We can't generate cgroup path using dentry->d_name, as accessing + * dentry->name must be protected by irq-unsafe dentry->d_lock or parent + * inode's i_mutex, while on the other hand cgroup_path() can be called + * with some irq-safe spinlocks held. 
*/ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { - struct dentry *dentry = cgrp->dentry; + int ret = -ENAMETOOLONG; char *start; - rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), - "cgroup_path() called without proper locking"); - - if (cgrp == dummytop) { - /* - * Inactive subsystems have no dentry for their root - * cgroup - */ - strcpy(buf, "/"); - return 0; - } - start = buf + buflen - 1; - *start = '\0'; - for (;;) { - int len = dentry->d_name.len; + rcu_read_lock(); + while (cgrp) { + const char *name = cgroup_name(cgrp); + int len; + + len = strlen(name); if ((start -= len) < buf) - return -ENAMETOOLONG; - memcpy(start, dentry->d_name.name, len); - cgrp = cgrp->parent; - if (!cgrp) - break; + goto out; + memcpy(start, name, len); - dentry = cgrp->dentry; if (!cgrp->parent) - continue; + break; + if (--start < buf) - return -ENAMETOOLONG; + goto out; *start = '/'; + + cgrp = cgrp->parent; } + ret = 0; memmove(buf, start, buf + buflen - start); - return 0; +out: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL_GPL(cgroup_path); @@ -2537,13 +2548,40 @@ static int cgroup_file_release(struct inode *inode, struct file *file) static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { + int ret; + struct cgroup_name *name, *old_name; + struct cgroup *cgrp; + + /* + * It's convenient to use parent dir's i_mutex to protect + * cgrp->name. + */ + lockdep_assert_held(&old_dir->i_mutex); + if (!S_ISDIR(old_dentry->d_inode->i_mode)) return -ENOTDIR; if (new_dentry->d_inode) return -EEXIST; if (old_dir != new_dir) return -EIO; - return simple_rename(old_dir, old_dentry, new_dir, new_dentry); + + cgrp = __d_cgrp(old_dentry); + + name = cgroup_alloc_name(new_dentry); + if (!name) + return -ENOMEM; + + ret = simple_rename(old_dir, old_dentry, new_dir, new_dentry); + if (ret) { + kfree(name); + return ret; + } + + old_name = cgrp->name; + rcu_assign_pointer(cgrp->name, name); + + kfree_rcu(old_name, rcu_head); + return 0; } static struct simple_xattrs *__d_xattrs(struct dentry *dentry) @@ -4158,6 +4196,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { struct cgroup *cgrp; + struct cgroup_name *name; struct cgroupfs_root *root = parent->root; int err = 0; struct cgroup_subsys *ss; @@ -4168,9 +4207,14 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (!cgrp) return -ENOMEM; + name = cgroup_alloc_name(dentry); + if (!name) + goto err_free_cgrp; + rcu_assign_pointer(cgrp->name, name); + cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); if (cgrp->id < 0) - goto err_free_cgrp; + goto err_free_name; /* * Only live parents can have children. Note that the liveliness @@ -4276,6 +4320,8 @@ err_free_all: deactivate_super(sb); err_free_id: ida_simple_remove(&root->cgroup_ida, cgrp->id); +err_free_name: + kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: kfree(cgrp); return err; -- cgit From 462fce46065ec4b200c08619c047b9e5a8fd154a Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:41:56 +0900 Subject: KVM: set_memory_region: Drop user_alloc from prepare/commit_memory_region() X86 does not use this any more. The remaining user, s390's !user_alloc check, can be simply removed since KVM_SET_MEMORY_REGION ioctl is no longer supported. Note: fixed powerpc's indentations with spaces to suppress checkpatch errors. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 6 ++---- arch/ia64/kvm/kvm-ia64.c | 6 ++---- arch/powerpc/kvm/powerpc.c | 12 +++++------- arch/s390/kvm/kvm-s390.c | 9 ++------- arch/x86/kvm/x86.c | 6 ++---- include/linux/kvm_host.h | 6 ++---- virt/kvm/kvm_main.c | 4 ++-- 7 files changed, 17 insertions(+), 32 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 5a936988eb24..24cb5f66787d 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -231,16 +231,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index ad3126a58644..cbc5b0417dab 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1579,8 +1579,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { unsigned long i; unsigned long pfn; @@ -1610,8 +1609,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { return; } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 934413cd3a1b..22b33159fbc4 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -411,18 +411,16 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) } int kvm_arch_prepare_memory_region(struct kvm *kvm, - struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_memory_slot *memslot, + struct kvm_memory_slot old, + struct kvm_userspace_memory_region *mem) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_userspace_memory_region *mem, + struct kvm_memory_slot old) { kvmppc_core_commit_memory_region(kvm, mem, old); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4cf35a0a79e7..07ac302ce246 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -975,8 +975,7 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { /* A few sanity checks. 
We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a @@ -997,16 +996,12 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, if (mem->memory_size & 0xffffful) return -EINVAL; - if (!user_alloc) - return -EINVAL; - return 0; } void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { int rc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 811c5c9c8880..26216bb4403f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6907,8 +6907,7 @@ out_free: int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { int npages = memslot->npages; @@ -6938,8 +6937,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc) + struct kvm_memory_slot old) { int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index cad77fe09d77..b4757a1cc4c4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -464,12 +464,10 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old, - bool user_alloc); + struct kvm_memory_slot old); bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); /* flush all memory translations */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index adc68feb5c5a..fd3037010e75 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -875,7 +875,7 @@ int __kvm_set_memory_region(struct kvm *kvm, slots = old_memslots; } - r = kvm_arch_prepare_memory_region(kvm, &new, old, mem, user_alloc); + r = kvm_arch_prepare_memory_region(kvm, &new, old, mem); if (r) goto out_slots; @@ -915,7 +915,7 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, slots, &new); - kvm_arch_commit_memory_region(kvm, mem, old, user_alloc); + kvm_arch_commit_memory_region(kvm, mem, old); kvm_free_physmem_slot(&old, &new); kfree(old_memslots); -- cgit From 47ae31e257c548abdb199e0d26723139a9a967ba Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:43:00 +0900 Subject: KVM: set_memory_region: Drop user_alloc from set_memory_region() Except ia64's stale code, KVM_SET_MEMORY_REGION support, this is only used for sanity checks in __kvm_set_memory_region() which can easily be changed to use slot id instead. 
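The replacement check can be sketched as follows; the bound is illustrative, since the real KVM_USER_MEM_SLOTS is arch-specific:

#include <stdbool.h>

#define KVM_USER_MEM_SLOTS 125	/* illustrative value only */

/*
 * Slots below KVM_USER_MEM_SLOTS are filled in by userspace via
 * KVM_SET_USER_MEMORY_REGION, so their userspace_addr must be
 * validated; ids at or above the bound belong to internal slots
 * that the kernel sets up itself.
 */
static bool slot_needs_userspace_checks(unsigned int slot)
{
	return slot < KVM_USER_MEM_SLOTS;
}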
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/ia64/kvm/kvm-ia64.c | 18 ------------------ arch/x86/kvm/vmx.c | 6 +++--- include/linux/kvm_host.h | 10 +++------- virt/kvm/kvm_main.c | 18 +++++++----------- 4 files changed, 13 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index cbc5b0417dab..43701f0c0f71 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -942,24 +942,6 @@ long kvm_arch_vm_ioctl(struct file *filp, int r = -ENOTTY; switch (ioctl) { - case KVM_SET_MEMORY_REGION: { - struct kvm_memory_region kvm_mem; - struct kvm_userspace_memory_region kvm_userspace_mem; - - r = -EFAULT; - if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem)) - goto out; - kvm_userspace_mem.slot = kvm_mem.slot; - kvm_userspace_mem.flags = kvm_mem.flags; - kvm_userspace_mem.guest_phys_addr = - kvm_mem.guest_phys_addr; - kvm_userspace_mem.memory_size = kvm_mem.memory_size; - r = kvm_vm_ioctl_set_memory_region(kvm, - &kvm_userspace_mem, false); - if (r) - goto out; - break; - } case KVM_CREATE_IRQCHIP: r = -EFAULT; r = kvm_ioapic_init(kvm); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7cc566b09ff2..58fb7c27e3b5 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3694,7 +3694,7 @@ static int alloc_apic_access_page(struct kvm *kvm) kvm_userspace_mem.flags = 0; kvm_userspace_mem.guest_phys_addr = 0xfee00000ULL; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -3724,7 +3724,7 @@ static int alloc_identity_pagetable(struct kvm *kvm) kvm_userspace_mem.guest_phys_addr = kvm->arch.ept_identity_map_addr; kvm_userspace_mem.memory_size = PAGE_SIZE; - r = __kvm_set_memory_region(kvm, &kvm_userspace_mem, false); + r = __kvm_set_memory_region(kvm, &kvm_userspace_mem); if (r) goto out; @@ -4364,7 +4364,7 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr) .flags = 0, }; - ret = kvm_set_memory_region(kvm, &tss_mem, false); + ret = kvm_set_memory_region(kvm, &tss_mem); if (ret) return ret; kvm->arch.tss_addr = addr; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index b4757a1cc4c4..84a994c7a5c5 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -453,11 +453,9 @@ id_to_memslot(struct kvm_memslots *slots, int id) } int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); void kvm_arch_free_memslot(struct kvm_memory_slot *free, struct kvm_memory_slot *dont); int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); @@ -553,9 +551,7 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log); int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - bool user_alloc); + struct kvm_userspace_memory_region *mem); int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level); long kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index fd3037010e75..5b3e41b81f0d 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -745,8 +745,7 @@ enum kvm_mr_change { * Must be called holding mmap_sem for 
write. */ int __kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { int r; gfn_t base_gfn; @@ -767,7 +766,7 @@ int __kvm_set_memory_region(struct kvm *kvm, if (mem->guest_phys_addr & (PAGE_SIZE - 1)) goto out; /* We can read the guest memory with __xxx_user() later on. */ - if (user_alloc && + if ((mem->slot < KVM_USER_MEM_SLOTS) && ((mem->userspace_addr & (PAGE_SIZE - 1)) || !access_ok(VERIFY_WRITE, (void __user *)(unsigned long)mem->userspace_addr, @@ -932,26 +931,23 @@ out: EXPORT_SYMBOL_GPL(__kvm_set_memory_region); int kvm_set_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { int r; mutex_lock(&kvm->slots_lock); - r = __kvm_set_memory_region(kvm, mem, user_alloc); + r = __kvm_set_memory_region(kvm, mem); mutex_unlock(&kvm->slots_lock); return r; } EXPORT_SYMBOL_GPL(kvm_set_memory_region); int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, - struct - kvm_userspace_memory_region *mem, - bool user_alloc) + struct kvm_userspace_memory_region *mem) { if (mem->slot >= KVM_USER_MEM_SLOTS) return -EINVAL; - return kvm_set_memory_region(kvm, mem, user_alloc); + return kvm_set_memory_region(kvm, mem); } int kvm_get_dirty_log(struct kvm *kvm, @@ -2198,7 +2194,7 @@ static long kvm_vm_ioctl(struct file *filp, sizeof kvm_userspace_mem)) goto out; - r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, true); + r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem); break; } case KVM_GET_DIRTY_LOG: { -- cgit From 74d0727cb7aaaea48a6353209093be26abc8d160 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:43:44 +0900 Subject: KVM: set_memory_region: Make kvm_mr_change available to arch code This will be used for cleaning up prepare/commit_memory_region() later. 
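As a rough sketch of how a slot update maps onto these values (a hypothetical helper mirroring the intent described above, not the exact kernel code):

enum kvm_mr_change {
	KVM_MR_CREATE,
	KVM_MR_DELETE,
	KVM_MR_MOVE,
	KVM_MR_FLAGS_ONLY,
};

static enum kvm_mr_change classify_change(unsigned long old_npages,
					  unsigned long new_npages,
					  unsigned long old_base_gfn,
					  unsigned long new_base_gfn)
{
	if (!old_npages)
		return KVM_MR_CREATE;		/* slot did not exist before */
	if (!new_npages)
		return KVM_MR_DELETE;		/* size 0 deletes the slot */
	if (new_base_gfn != old_base_gfn)
		return KVM_MR_MOVE;		/* moved in guest physical space */
	return KVM_MR_FLAGS_ONLY;		/* only the flags changed */
}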
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 18 ++++++++++++++++++ virt/kvm/kvm_main.c | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 84a994c7a5c5..8eaf61f7b02d 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -452,6 +452,24 @@ id_to_memslot(struct kvm_memslots *slots, int id) return slot; } +/* + * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: + * - create a new memory slot + * - delete an existing memory slot + * - modify an existing memory slot + * -- move it in the guest physical memory space + * -- just change its flags + * + * Since flags can be changed by some of these operations, the following + * differentiation is the best we can do for __kvm_set_memory_region(): + */ +enum kvm_mr_change { + KVM_MR_CREATE, + KVM_MR_DELETE, + KVM_MR_MOVE, + KVM_MR_FLAGS_ONLY, +}; + int kvm_set_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); int __kvm_set_memory_region(struct kvm *kvm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 5b3e41b81f0d..c7979ed41923 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -718,24 +718,6 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, return old_memslots; } -/* - * KVM_SET_USER_MEMORY_REGION ioctl allows the following operations: - * - create a new memory slot - * - delete an existing memory slot - * - modify an existing memory slot - * -- move it in the guest physical memory space - * -- just change its flags - * - * Since flags can be changed by some of these operations, the following - * differentiation is the best we can do for __kvm_set_memory_region(): - */ -enum kvm_mr_change { - KVM_MR_CREATE, - KVM_MR_DELETE, - KVM_MR_MOVE, - KVM_MR_FLAGS_ONLY, -}; - /* * Allocate some memory and give it an address in the guest physical address * space. -- cgit From 7b6195a91d60909a2834ab7181e2b9476e6fe749 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:44:34 +0900 Subject: KVM: set_memory_region: Refactor prepare_memory_region() This patch drops the parameter old, a copy of the old memory slot, and adds a new parameter named change to know the change being requested. This not only cleans up the code but also removes extra copying of the memory slot structure. 
Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 4 ++-- arch/ia64/kvm/kvm-ia64.c | 4 ++-- arch/powerpc/kvm/powerpc.c | 4 ++-- arch/s390/kvm/kvm-s390.c | 4 ++-- arch/x86/kvm/x86.c | 10 ++++------ include/linux/kvm_host.h | 4 ++-- virt/kvm/kvm_main.c | 2 +- 7 files changed, 15 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 24cb5f66787d..96ebab7a1959 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -230,8 +230,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return 0; } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 43701f0c0f71..5c2b07e8c3d6 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1560,8 +1560,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { unsigned long i; unsigned long pfn; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 22b33159fbc4..8aa51cd67c28 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -412,8 +412,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 07ac302ce246..4288780c86b8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -974,8 +974,8 @@ int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) /* Section: memory related */ int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { /* A few sanity checks. We can have exactly one memory slot which has to start at guest virtual zero and which has to be located at a diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 26216bb4403f..7198234fa088 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6906,23 +6906,21 @@ out_free: int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem) + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) { - int npages = memslot->npages; - /* * Only private memory slots need to be mapped here since * KVM_SET_MEMORY_REGION ioctl is no longer supported. */ - if ((memslot->id >= KVM_USER_MEM_SLOTS) && npages && !old.npages) { + if ((memslot->id >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_CREATE)) { unsigned long userspace_addr; /* * MAP_SHARED to prevent internal slot pages from being moved * by fork()/COW. 
*/ - userspace_addr = vm_mmap(NULL, 0, npages * PAGE_SIZE, + userspace_addr = vm_mmap(NULL, 0, memslot->npages * PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, 0); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 8eaf61f7b02d..caa72cf7e8e7 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -479,8 +479,8 @@ void kvm_arch_free_memslot(struct kvm_memory_slot *free, int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages); int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_memory_slot old, - struct kvm_userspace_memory_region *mem); + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, struct kvm_memory_slot old); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index c7979ed41923..8f85bae862c7 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -856,7 +856,7 @@ int __kvm_set_memory_region(struct kvm *kvm, slots = old_memslots; } - r = kvm_arch_prepare_memory_region(kvm, &new, old, mem); + r = kvm_arch_prepare_memory_region(kvm, &new, mem, change); if (r) goto out_slots; -- cgit From 8482644aea11e0647867732319ccf35879a9acc2 Mon Sep 17 00:00:00 2001 From: Takuya Yoshikawa Date: Wed, 27 Feb 2013 19:45:25 +0900 Subject: KVM: set_memory_region: Refactor commit_memory_region() This patch makes the parameter old a const pointer to the old memory slot and adds a new parameter named change to know the change being requested: the former is for removing extra copying and the latter is for cleaning up the code. Signed-off-by: Takuya Yoshikawa Signed-off-by: Marcelo Tosatti --- arch/arm/kvm/arm.c | 3 ++- arch/ia64/kvm/kvm-ia64.c | 3 ++- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 4 ++-- arch/powerpc/kvm/book3s_pr.c | 2 +- arch/powerpc/kvm/booke.c | 2 +- arch/powerpc/kvm/powerpc.c | 3 ++- arch/s390/kvm/kvm-s390.c | 3 ++- arch/x86/kvm/x86.c | 15 ++++++++------- include/linux/kvm_host.h | 3 ++- virt/kvm/kvm_main.c | 2 +- 11 files changed, 24 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 96ebab7a1959..b32dc446e802 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -238,7 +238,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { } diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c index 5c2b07e8c3d6..7a54455dde39 100644 --- a/arch/ia64/kvm/kvm-ia64.c +++ b/arch/ia64/kvm/kvm-ia64.c @@ -1591,7 +1591,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { return; } diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 44a657adf416..44fa9ad1d62c 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -152,7 +152,7 @@ extern int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem); extern void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old); + const struct kvm_memory_slot *old); 
extern int kvm_vm_ioctl_get_smmu_info(struct kvm *kvm, struct kvm_ppc_smmu_info *info); extern void kvmppc_core_flush_memslot(struct kvm *kvm, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 80dcc53a1aba..1e521baf9a7d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1639,12 +1639,12 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - if (npages && old.npages) { + if (npages && old->npages) { /* * If modifying a memslot, reset all the rmap dirty bits. * If this is a new memslot, we don't need to do anything diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 5e93438afb06..286e23e6b92d 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -1283,7 +1283,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { } diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 020923e43134..eb88fa621073 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -1531,7 +1531,7 @@ int kvmppc_core_prepare_memory_region(struct kvm *kvm, void kvmppc_core_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old) { } diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 8aa51cd67c28..7b5d4d20cdc5 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -420,7 +420,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { kvmppc_core_commit_memory_region(kvm, mem, old); } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 4288780c86b8..6cae4ad647a9 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1001,7 +1001,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { int rc; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7198234fa088..35b491229c3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6935,16 +6935,17 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old) + const struct kvm_memory_slot *old, + enum kvm_mr_change change) { - int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT; + int nr_mmu_pages = 0; - if ((mem->slot >= KVM_USER_MEM_SLOTS) && old.npages && !npages) { + if ((mem->slot >= KVM_USER_MEM_SLOTS) && (change == KVM_MR_DELETE)) { int ret; - ret = vm_munmap(old.userspace_addr, - old.npages * PAGE_SIZE); + ret = vm_munmap(old->userspace_addr, + old->npages * PAGE_SIZE); if (ret < 0) printk(KERN_WARNING "kvm_vm_ioctl_set_memory_region: " @@ -6961,13 +6962,13 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, * Existing largepage 
mappings are destroyed here and new ones will * not be created until the end of the logging. */ - if (npages && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) + if ((change != KVM_MR_DELETE) && (mem->flags & KVM_MEM_LOG_DIRTY_PAGES)) kvm_mmu_slot_remove_write_access(kvm, mem->slot); /* * If memory slot is created, or moved, we need to clear all * mmio sptes. */ - if (npages && old.base_gfn != mem->guest_phys_addr >> PAGE_SHIFT) { + if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) { kvm_mmu_zap_all(kvm); kvm_reload_remote_mmus(kvm); } diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index caa72cf7e8e7..ac584cc53581 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -483,7 +483,8 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm, enum kvm_mr_change change); void kvm_arch_commit_memory_region(struct kvm *kvm, struct kvm_userspace_memory_region *mem, - struct kvm_memory_slot old); + const struct kvm_memory_slot *old, + enum kvm_mr_change change); bool kvm_largepages_enabled(void); void kvm_disable_largepages(void); /* flush all memory translations */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 8f85bae862c7..0e919a1d4d56 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -896,7 +896,7 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, slots, &new); - kvm_arch_commit_memory_region(kvm, mem, old); + kvm_arch_commit_memory_region(kvm, mem, &old, change); kvm_free_physmem_slot(&old, &new); kfree(old_memslots); -- cgit From 51dcdafcb720a9d1fd73b597d0ccf48837abc59f Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Tue, 5 Mar 2013 14:16:00 +0800 Subject: regulator: core: Add enable_is_inverted flag to indicate set enable_mask bits to disable Add an enable_is_inverted flag to indicate that setting the enable_mask bits disables the regulator when using regulator_enable_regmap and friends APIs. 
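In other words, on such hardware a set bit in enable_mask means the rail is off. A hypothetical driver description using the new flag could look like this (the MYPMIC_* names and mypmic_ldo_ops are assumed for illustration):

static const struct regulator_desc mypmic_ldo1_desc = {
	.name			= "mypmic-ldo1",
	.id			= 0,
	.type			= REGULATOR_VOLTAGE,
	.owner			= THIS_MODULE,
	.ops			= &mypmic_ldo_ops,	/* built on regulator_enable_regmap() etc. */
	.enable_reg		= MYPMIC_LDO1_CTRL,	/* assumed register */
	.enable_mask		= BIT(0),		/* on this chip: 1 = disabled */
	.enable_is_inverted	= true,			/* so the core writes 0 to enable */
};

With the flag set, regulator_enable_regmap() clears the mask bits and regulator_disable_regmap() sets them, matching the core changes below.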
Signed-off-by: Axel Lin Reviewed-by: Haojian Zhuang Signed-off-by: Mark Brown --- drivers/regulator/core.c | 24 ++++++++++++++++++++---- include/linux/regulator/driver.h | 3 +++ 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/core.c b/drivers/regulator/core.c index 154bc8f0c1a0..d887b9f5b213 100644 --- a/drivers/regulator/core.c +++ b/drivers/regulator/core.c @@ -1794,7 +1794,10 @@ int regulator_is_enabled_regmap(struct regulator_dev *rdev) if (ret != 0) return ret; - return (val & rdev->desc->enable_mask) != 0; + if (rdev->desc->enable_is_inverted) + return (val & rdev->desc->enable_mask) == 0; + else + return (val & rdev->desc->enable_mask) != 0; } EXPORT_SYMBOL_GPL(regulator_is_enabled_regmap); @@ -1809,9 +1812,15 @@ EXPORT_SYMBOL_GPL(regulator_is_enabled_regmap); */ int regulator_enable_regmap(struct regulator_dev *rdev) { + unsigned int val; + + if (rdev->desc->enable_is_inverted) + val = 0; + else + val = rdev->desc->enable_mask; + return regmap_update_bits(rdev->regmap, rdev->desc->enable_reg, - rdev->desc->enable_mask, - rdev->desc->enable_mask); + rdev->desc->enable_mask, val); } EXPORT_SYMBOL_GPL(regulator_enable_regmap); @@ -1826,8 +1835,15 @@ EXPORT_SYMBOL_GPL(regulator_enable_regmap); */ int regulator_disable_regmap(struct regulator_dev *rdev) { + unsigned int val; + + if (rdev->desc->enable_is_inverted) + val = rdev->desc->enable_mask; + else + val = 0; + return regmap_update_bits(rdev->regmap, rdev->desc->enable_reg, - rdev->desc->enable_mask, 0); + rdev->desc->enable_mask, val); } EXPORT_SYMBOL_GPL(regulator_disable_regmap); diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index 7df93f52db08..07ea8f1a127e 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -199,6 +199,8 @@ enum regulator_type { * output when using regulator_set_voltage_sel_regmap * @enable_reg: Register for control when using regmap enable/disable ops * @enable_mask: Mask for control when using regmap enable/disable ops + * @enable_is_inverted: A flag to indicate that setting enable_mask bits disables + * the regulator when using regulator_enable_regmap and friends APIs. * @bypass_reg: Register for control when using regmap set_bypass * @bypass_mask: Mask for control when using regmap set_bypass * @@ -228,6 +230,7 @@ struct regulator_desc { unsigned int apply_bit; unsigned int enable_reg; unsigned int enable_mask; + bool enable_is_inverted; unsigned int bypass_reg; unsigned int bypass_mask; -- cgit From 7d8e0bf56a66bab08d2f316dd87e56c08cecb899 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 10:57:03 +0800 Subject: cgroup: avoid accessing modular cgroup subsys structure without locking subsys[i] is set to NULL in cgroup_unload_subsys() at modular unload, and that's protected by cgroup_mutex, and then the memory *subsys[i] resides in will be freed. So this is unsafe without any locking: if (!ss || ss->module) ... 
v2: - add a comment for enum cgroup_subsys_id - simplify the comment in cgroup_exit() Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 17 ++++++++++++++--- kernel/cgroup.c | 28 ++++++++++++++-------------- 2 files changed, 28 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 75c6ec1ba1ba..5f76829dd75e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -44,14 +44,25 @@ extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern const struct file_operations proc_cgroup_operations; -/* Define the enumeration of all builtin cgroup subsystems */ +/* + * Define the enumeration of all cgroup subsystems. + * + * We define ids for builtin subsystems and then modular ones. + */ #define SUBSYS(_x) _x ## _subsys_id, -#define IS_SUBSYS_ENABLED(option) IS_ENABLED(option) enum cgroup_subsys_id { +#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) +#include +#undef IS_SUBSYS_ENABLED + CGROUP_BUILTIN_SUBSYS_COUNT, + + __CGROUP_SUBSYS_TEMP_PLACEHOLDER = CGROUP_BUILTIN_SUBSYS_COUNT - 1, + +#define IS_SUBSYS_ENABLED(option) IS_MODULE(option) #include +#undef IS_SUBSYS_ENABLED CGROUP_SUBSYS_COUNT, }; -#undef IS_SUBSYS_ENABLED #undef SUBSYS /* Per-subsystem/per-cgroup state maintained by the system. */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9df799d5d31c..7a6c4c72ca55 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4940,17 +4940,17 @@ void cgroup_post_fork(struct task_struct *child) * and addition to css_set. */ if (need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, and the builtin section of the subsys + * array is immutable, so we don't need to lock the + * subsys array here. On the other hand, modular section + * of the array can be freed at module unload, so we + * can't touch that. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* - * fork/exit callbacks are supported only for - * builtin subsystems and we don't need further - * synchronization as they never go away. - */ - if (!ss || ss->module) - continue; - if (ss->fork) ss->fork(child); } @@ -5015,13 +5015,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) tsk->cgroups = &init_css_set; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * fork/exit callbacks are supported only for builtin + * subsystems, see cgroup_post_fork() for details. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; - /* modular subsystems can't use callbacks */ - if (!ss || ss->module) - continue; - if (ss->exit) { struct cgroup *old_cgrp = rcu_dereference_raw(cg->subsys[i])->cgroup; -- cgit From 9259826ccb8165f797e4c2c9d17925b41af5f6ae Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 11:37:50 +0800 Subject: res_counter: remove include of cgroup.h from res_counter.h It's not needed at all. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/res_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 5ae8456d9670..a83a849bf1d3 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -13,7 +13,7 @@ * info about what this counter is. */ -#include +#include /* * The core object. 
the cgroup that wishes to account for some -- cgit From ff794dea52eaaa09017efea688a1d7f92ab0818e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 11:37:56 +0800 Subject: cpuset: remove include of cgroup.h from cpuset.h We don't need to include cgroup.h in cpuset.h. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cpuset.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 8c8a60d29407..ccd1de8ad822 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #ifdef CONFIG_CPUSETS -- cgit From e1fd1f490fa4213bd3060efa823a39d299538f72 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Mar 2013 15:04:55 -0500 Subject: get rid of union semop in sys_semctl(2) arguments just have the bugger take unsigned long and deal with SETVAL case (when we use an int member in the union) explicitly. Signed-off-by: Al Viro --- arch/parisc/kernel/sys_parisc32.c | 15 ----- arch/parisc/kernel/syscall_table.S | 2 +- arch/sparc/kernel/sys_sparc_64.c | 2 +- include/linux/syscalls.h | 2 +- ipc/compat.c | 14 +++-- ipc/sem.c | 121 +++++++++++++++++++++++-------------- ipc/syscall.c | 6 +- 7 files changed, 91 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 46bdf6080fe4..f517e08e7f0d 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -60,21 +60,6 @@ asmlinkage long sys32_unimplemented(int r26, int r25, int r24, int r23, return -ENOSYS; } -asmlinkage long sys32_semctl(int semid, int semnum, int cmd, union semun arg) -{ - union semun u; - - if (cmd == SETVAL) { - /* Ugh. arg is a union of int,ptr,ptr,ptr, so is 8 bytes. - * The int should be in the first 4, but our argument - * frobbing has left it in the last 4. 
- */ - u.val = *((int *)&arg + 1); - return sys_semctl (semid, semnum, cmd, u); - } - return sys_semctl (semid, semnum, cmd, arg); -} - asmlinkage long compat_sys_fanotify_mark(int fan_fd, int flags, u32 mask_hi, u32 mask_lo, int fd, const char __user *pathname) diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index 30c9a3bba1cc..0c9107285e66 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -282,7 +282,7 @@ ENTRY_COMP(recvmsg) ENTRY_SAME(semop) /* 185 */ ENTRY_SAME(semget) - ENTRY_DIFF(semctl) + ENTRY_COMP(semctl) ENTRY_COMP(msgsnd) ENTRY_COMP(msgrcv) ENTRY_SAME(msgget) /* 190 */ diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 42beb6fc4ad8..2daaaa6eda23 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -353,7 +353,7 @@ SYSCALL_DEFINE6(sparc_ipc, unsigned int, call, int, first, unsigned long, second case SEMCTL: { err = sys_semctl(first, second, (int)third | IPC_64, - (union semun) ptr); + (unsigned long) ptr); goto out; } default: diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 9660a8bdcbbe..65c001f7fa0b 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -657,7 +657,7 @@ asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf); asmlinkage long sys_semget(key_t key, int nsems, int semflg); asmlinkage long sys_semop(int semid, struct sembuf __user *sops, unsigned nsops); -asmlinkage long sys_semctl(int semid, int semnum, int cmd, union semun arg); +asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg); asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops, unsigned nsops, const struct timespec __user *timeout); diff --git a/ipc/compat.c b/ipc/compat.c index 6cb6a4df86e4..892f6585dd60 100644 --- a/ipc/compat.c +++ b/ipc/compat.c @@ -240,7 +240,7 @@ static inline int put_compat_semid_ds(struct semid64_ds *s, static long do_compat_semctl(int first, int second, int third, u32 pad) { - union semun fourth; + unsigned long fourth; int err, err2; struct semid64_ds s64; struct semid64_ds __user *up64; @@ -249,9 +249,13 @@ static long do_compat_semctl(int first, int second, int third, u32 pad) memset(&s64, 0, sizeof(s64)); if ((third & (~IPC_64)) == SETVAL) - fourth.val = (int) pad; +#ifdef __BIG_ENDIAN + fourth = (unsigned long)pad << 32; +#else + fourth = pad; +#endif else - fourth.__pad = compat_ptr(pad); + fourth = (unsigned long)compat_ptr(pad); switch (third & (~IPC_64)) { case IPC_INFO: case IPC_RMID: @@ -269,7 +273,7 @@ static long do_compat_semctl(int first, int second, int third, u32 pad) case IPC_STAT: case SEM_STAT: up64 = compat_alloc_user_space(sizeof(s64)); - fourth.__pad = up64; + fourth = (unsigned long)up64; err = sys_semctl(first, second, third, fourth); if (err < 0) break; @@ -295,7 +299,7 @@ static long do_compat_semctl(int first, int second, int third, u32 pad) if (err) break; - fourth.__pad = up64; + fourth = (unsigned long)up64; err = sys_semctl(first, second, third, fourth); break; diff --git a/ipc/sem.c b/ipc/sem.c index e7236df7a470..5b167d00efa6 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -799,7 +799,7 @@ static unsigned long copy_semid_to_user(void __user *buf, struct semid64_ds *in, } static int semctl_nolock(struct ipc_namespace *ns, int semid, - int cmd, int version, union semun arg) + int cmd, int version, void __user *p) { int err; struct sem_array *sma; @@ -834,7 +834,7 @@ static int semctl_nolock(struct 
ipc_namespace *ns, int semid, } max_id = ipc_get_maxid(&sem_ids(ns)); up_read(&sem_ids(ns).rw_mutex); - if (copy_to_user (arg.__buf, &seminfo, sizeof(struct seminfo))) + if (copy_to_user(p, &seminfo, sizeof(struct seminfo))) return -EFAULT; return (max_id < 0) ? 0: max_id; } @@ -871,7 +871,7 @@ static int semctl_nolock(struct ipc_namespace *ns, int semid, tbuf.sem_ctime = sma->sem_ctime; tbuf.sem_nsems = sma->sem_nsems; sem_unlock(sma); - if (copy_semid_to_user (arg.buf, &tbuf, version)) + if (copy_semid_to_user(p, &tbuf, version)) return -EFAULT; return id; } @@ -883,8 +883,67 @@ out_unlock: return err; } +static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, + unsigned long arg) +{ + struct sem_undo *un; + struct sem_array *sma; + struct sem* curr; + int err; + int nsems; + struct list_head tasks; + int val; +#if defined(CONFIG_64BIT) && defined(__BIG_ENDIAN) + /* big-endian 64bit */ + val = arg >> 32; +#else + /* 32bit or little-endian 64bit */ + val = arg; +#endif + + sma = sem_lock_check(ns, semid); + if (IS_ERR(sma)) + return PTR_ERR(sma); + + INIT_LIST_HEAD(&tasks); + nsems = sma->sem_nsems; + + err = -EACCES; + if (ipcperms(ns, &sma->sem_perm, S_IWUGO)) + goto out_unlock; + + err = security_sem_semctl(sma, SETVAL); + if (err) + goto out_unlock; + + err = -EINVAL; + if(semnum < 0 || semnum >= nsems) + goto out_unlock; + + curr = &sma->sem_base[semnum]; + + err = -ERANGE; + if (val > SEMVMX || val < 0) + goto out_unlock; + + assert_spin_locked(&sma->sem_perm.lock); + list_for_each_entry(un, &sma->list_id, list_id) + un->semadj[semnum] = 0; + + curr->semval = val; + curr->sempid = task_tgid_vnr(current); + sma->sem_ctime = get_seconds(); + /* maybe some queued-up processes were waiting for this */ + do_smart_update(sma, NULL, 0, 0, &tasks); + err = 0; +out_unlock: + sem_unlock(sma); + wake_up_sem_queue_do(&tasks); + return err; +} + static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, - int cmd, int version, union semun arg) + int cmd, void __user *p) { struct sem_array *sma; struct sem* curr; @@ -903,7 +962,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, err = -EACCES; if (ipcperms(ns, &sma->sem_perm, - (cmd == SETVAL || cmd == SETALL) ? S_IWUGO : S_IRUGO)) + cmd == SETALL ? 
S_IWUGO : S_IRUGO)) goto out_unlock; err = security_sem_semctl(sma, cmd); @@ -914,7 +973,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, switch (cmd) { case GETALL: { - ushort __user *array = arg.array; + ushort __user *array = p; int i; if(nsems > SEMMSL_FAST) { @@ -957,7 +1016,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, } } - if (copy_from_user (sem_io, arg.array, nsems*sizeof(ushort))) { + if (copy_from_user (sem_io, p, nsems*sizeof(ushort))) { sem_putref(sma); err = -EFAULT; goto out_free; @@ -991,7 +1050,7 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, err = 0; goto out_unlock; } - /* GETVAL, GETPID, GETNCTN, GETZCNT, SETVAL: fall-through */ + /* GETVAL, GETPID, GETNCTN, GETZCNT: fall-through */ } err = -EINVAL; if(semnum < 0 || semnum >= nsems) @@ -1012,27 +1071,6 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, case GETZCNT: err = count_semzcnt(sma,semnum); goto out_unlock; - case SETVAL: - { - int val = arg.val; - struct sem_undo *un; - - err = -ERANGE; - if (val > SEMVMX || val < 0) - goto out_unlock; - - assert_spin_locked(&sma->sem_perm.lock); - list_for_each_entry(un, &sma->list_id, list_id) - un->semadj[semnum] = 0; - - curr->semval = val; - curr->sempid = task_tgid_vnr(current); - sma->sem_ctime = get_seconds(); - /* maybe some queued-up processes were waiting for this */ - do_smart_update(sma, NULL, 0, 0, &tasks); - err = 0; - goto out_unlock; - } } out_unlock: sem_unlock(sma); @@ -1076,7 +1114,7 @@ copy_semid_from_user(struct semid64_ds *out, void __user *buf, int version) * NOTE: no locks must be held, the rw_mutex is taken inside this function. */ static int semctl_down(struct ipc_namespace *ns, int semid, - int cmd, int version, union semun arg) + int cmd, int version, void __user *p) { struct sem_array *sma; int err; @@ -1084,7 +1122,7 @@ static int semctl_down(struct ipc_namespace *ns, int semid, struct kern_ipc_perm *ipcp; if(cmd == IPC_SET) { - if (copy_semid_from_user(&semid64, arg.buf, version)) + if (copy_semid_from_user(&semid64, p, version)) return -EFAULT; } @@ -1120,11 +1158,11 @@ out_up: return err; } -SYSCALL_DEFINE(semctl)(int semid, int semnum, int cmd, union semun arg) +SYSCALL_DEFINE4(semctl, int, semid, int, semnum, int, cmd, unsigned long, arg) { - int err = -EINVAL; int version; struct ipc_namespace *ns; + void __user *p = (void __user *)arg; if (semid < 0) return -EINVAL; @@ -1137,30 +1175,23 @@ SYSCALL_DEFINE(semctl)(int semid, int semnum, int cmd, union semun arg) case SEM_INFO: case IPC_STAT: case SEM_STAT: - err = semctl_nolock(ns, semid, cmd, version, arg); - return err; + return semctl_nolock(ns, semid, cmd, version, p); case GETALL: case GETVAL: case GETPID: case GETNCNT: case GETZCNT: - case SETVAL: case SETALL: - err = semctl_main(ns,semid,semnum,cmd,version,arg); - return err; + return semctl_main(ns, semid, semnum, cmd, p); + case SETVAL: + return semctl_setval(ns, semid, semnum, arg); case IPC_RMID: case IPC_SET: - err = semctl_down(ns, semid, cmd, version, arg); - return err; + return semctl_down(ns, semid, cmd, version, p); default: return -EINVAL; } } -asmlinkage long SyS_semctl(int semid, int semnum, int cmd, union semun arg) -{ - return SYSC_semctl((int) semid, (int) semnum, (int) cmd, arg); -} -SYSCALL_ALIAS(sys_semctl, SyS_semctl); /* If the task doesn't already have a undo_list, then allocate one * here. 
We guarantee there is only one thread using this undo list, diff --git a/ipc/syscall.c b/ipc/syscall.c index 0d1e32ce048e..52429489cde0 100644 --- a/ipc/syscall.c +++ b/ipc/syscall.c @@ -33,12 +33,12 @@ SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, case SEMGET: return sys_semget(first, second, third); case SEMCTL: { - union semun fourth; + unsigned long arg; if (!ptr) return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) + if (get_user(arg, (unsigned long __user *) ptr)) return -EFAULT; - return sys_semctl(first, second, third, fourth); + return sys_semctl(first, second, third, arg); } case MSGSND: -- cgit From 99e621f796d7f0341a51e8cdf32b81663b10b448 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Mar 2013 15:36:40 -0500 Subject: syscalls.h: slightly reduce the jungles of macros a) teach __MAP(num, m, ) to take empty list (with num being 0, of course) b) fold types__... and args__... declaration and initialization into SYSCALL_METADATA(num, ...), making their use conditional on num != 0. That allows to use the SYSCALL_METADATA instead of its near-duplicate in SYSCALL_DEFINE0. c) make SYSCALL_METADATA expand to nothing in case if CONFIG_FTRACE_SYSCALLS is not defined; that allows to make SYSCALL_DEFINE0 and SYSCALL_DEFINEx definitions independent from CONFIG_FTRACE_SYSCALLS. d) kill SYSCALL_DEFINE - no users left (SYSCALL_DEFINE[0-6] is, of course, still alive and well). Signed-off-by: Al Viro --- include/linux/syscalls.h | 49 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 65c001f7fa0b..4147d700a293 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -87,6 +87,7 @@ struct sigaltstack; * of __MAP starting at the third one) is in the same format as * for SYSCALL_DEFINE/COMPAT_SYSCALL_DEFINE */ +#define __MAP0(m,...) #define __MAP1(m,t,a) m(t,a) #define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__) #define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__) @@ -139,7 +140,13 @@ extern struct trace_event_functions exit_syscall_print_funcs; __attribute__((section("_ftrace_events"))) \ *__event_exit_##sname = &event_exit_##sname; -#define SYSCALL_METADATA(sname, nb) \ +#define SYSCALL_METADATA(sname, nb, ...) \ + static const char *types_##sname[] = { \ + __MAP(nb,__SC_STR_TDECL,__VA_ARGS__) \ + }; \ + static const char *args_##sname[] = { \ + __MAP(nb,__SC_STR_ADECL,__VA_ARGS__) \ + }; \ SYSCALL_TRACE_ENTER_EVENT(sname); \ SYSCALL_TRACE_EXIT_EVENT(sname); \ static struct syscall_metadata __used \ @@ -147,8 +154,8 @@ extern struct trace_event_functions exit_syscall_print_funcs; .name = "sys"#sname, \ .syscall_nr = -1, /* Filled in at boot */ \ .nb_args = nb, \ - .types = types_##sname, \ - .args = args_##sname, \ + .types = nb ? types_##sname : NULL, \ + .args = nb ? args_##sname : NULL, \ .enter_event = &event_enter_##sname, \ .exit_event = &event_exit_##sname, \ .enter_fields = LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \ @@ -156,26 +163,13 @@ extern struct trace_event_functions exit_syscall_print_funcs; static struct syscall_metadata __used \ __attribute__((section("__syscalls_metadata"))) \ *__p_syscall_meta_##sname = &__syscall_meta_##sname; +#else +#define SYSCALL_METADATA(sname, nb, ...) 
+#endif #define SYSCALL_DEFINE0(sname) \ - SYSCALL_TRACE_ENTER_EVENT(_##sname); \ - SYSCALL_TRACE_EXIT_EVENT(_##sname); \ - static struct syscall_metadata __used \ - __syscall_meta__##sname = { \ - .name = "sys_"#sname, \ - .syscall_nr = -1, /* Filled in at boot */ \ - .nb_args = 0, \ - .enter_event = &event_enter__##sname, \ - .exit_event = &event_exit__##sname, \ - .enter_fields = LIST_HEAD_INIT(__syscall_meta__##sname.enter_fields), \ - }; \ - static struct syscall_metadata __used \ - __attribute__((section("__syscalls_metadata"))) \ - *__p_syscall_meta_##sname = &__syscall_meta__##sname; \ + SYSCALL_METADATA(_##sname, 0); \ asmlinkage long sys_##sname(void) -#else -#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void) -#endif #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__) #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__) @@ -184,22 +178,9 @@ extern struct trace_event_functions exit_syscall_print_funcs; #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) -#ifdef CONFIG_FTRACE_SYSCALLS #define SYSCALL_DEFINEx(x, sname, ...) \ - static const char *types_##sname[] = { \ - __MAP(x,__SC_STR_TDECL,__VA_ARGS__) \ - }; \ - static const char *args_##sname[] = { \ - __MAP(x,__SC_STR_ADECL,__VA_ARGS__) \ - }; \ - SYSCALL_METADATA(sname, x); \ + SYSCALL_METADATA(sname, x, __VA_ARGS__) \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) -#else -#define SYSCALL_DEFINEx(x, sname, ...) \ - __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) -#endif - -#define SYSCALL_DEFINE(name) static inline long SYSC_##name #define __PROTECT(...) asmlinkage_protect(__VA_ARGS__) #define __SYSCALL_DEFINEx(x, name, ...) \ -- cgit From a0f155e9646d5f1c263f6f9aae880151100243bb Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:18 +0100 Subject: KVM: Initialize irqfd from kvm_init(). Currently, eventfd introduces module_init/module_exit functions to initialize/cleanup the irqfd workqueue. This only works, however, if no other module_init/module_exit functions are built into the same module. Let's just move the initialization and cleanup to kvm_init and kvm_exit. This way, it is also clearer where kvm startup may fail. Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 13 +++++++++++++ virt/kvm/eventfd.c | 7 ++----- virt/kvm/kvm_main.c | 6 ++++++ 3 files changed, 21 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ac584cc53581..d50fe173028b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -424,6 +424,19 @@ void kvm_vcpu_uninit(struct kvm_vcpu *vcpu); int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); +#ifdef __KVM_HAVE_IOAPIC +int kvm_irqfd_init(void); +void kvm_irqfd_exit(void); +#else +static inline int kvm_irqfd_init(void) +{ + return 0; +} + +static inline void kvm_irqfd_exit(void) +{ +} +#endif int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, struct module *module); void kvm_exit(void); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index adb17f266b28..0b6fe69bb03d 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -543,7 +543,7 @@ void kvm_irq_routing_update(struct kvm *kvm, * aggregated from all vm* instances. We need our own isolated single-thread * queue to prevent deadlock against flushing the normal work-queue. 
*/ -static int __init irqfd_module_init(void) +int kvm_irqfd_init(void) { irqfd_cleanup_wq = create_singlethread_workqueue("kvm-irqfd-cleanup"); if (!irqfd_cleanup_wq) @@ -552,13 +552,10 @@ static int __init irqfd_module_init(void) return 0; } -static void __exit irqfd_module_exit(void) +void kvm_irqfd_exit(void) { destroy_workqueue(irqfd_cleanup_wq); } - -module_init(irqfd_module_init); -module_exit(irqfd_module_exit); #endif /* diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 0e919a1d4d56..faf05bddd131 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2898,6 +2898,9 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, int r; int cpu; + r = kvm_irqfd_init(); + if (r) + goto out_irqfd; r = kvm_arch_init(opaque); if (r) goto out_fail; @@ -2978,6 +2981,8 @@ out_free_0a: out_free_0: kvm_arch_exit(); out_fail: + kvm_irqfd_exit(); +out_irqfd: return r; } EXPORT_SYMBOL_GPL(kvm_init); @@ -2994,6 +2999,7 @@ void kvm_exit(void) on_each_cpu(hardware_disable_nolock, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); + kvm_irqfd_exit(); free_cpumask_var(cpus_hardware_enabled); } EXPORT_SYMBOL_GPL(kvm_exit); -- cgit From 060f0ce6ff975decd1e0ee318c08e228bccbee1e Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Thu, 28 Feb 2013 12:33:19 +0100 Subject: KVM: Introduce KVM_VIRTIO_CCW_NOTIFY_BUS. Add a new bus type for virtio-ccw devices on s390. Signed-off-by: Cornelia Huck Signed-off-by: Marcelo Tosatti --- include/linux/kvm_host.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d50fe173028b..9fa13ebc3381 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -149,6 +149,7 @@ struct kvm_io_bus { enum kvm_bus { KVM_MMIO_BUS, KVM_PIO_BUS, + KVM_VIRTIO_CCW_NOTIFY_BUS, KVM_NR_BUSES }; -- cgit From 82dc3c63c692b1e1d59378ecee948ac88e034aad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Mar 2013 15:57:22 +0000 Subject: net: introduce NAPI_POLL_WEIGHT Some drivers use a too big NAPI poll weight. This patch adds a NAPI_POLL_WEIGHT default value and issues an error message if a driver attempts to use a bigger weight. Signed-off-by: Eric Dumazet Cc: Eilon Greenstein Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 5 +++++ net/core/dev.c | 3 +++ 2 files changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b3d00fa4b314..896eb4985f97 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1475,6 +1475,11 @@ static inline void *netdev_priv(const struct net_device *dev) */ #define SET_NETDEV_DEVTYPE(net, devtype) ((net)->dev.type = (devtype)) +/* Default NAPI poll() weight + * Device drivers are strongly advised to not use bigger value + */ +#define NAPI_POLL_WEIGHT 64 + /** * netif_napi_add - initialize a napi context * @dev: network device diff --git a/net/core/dev.c b/net/core/dev.c index a06a7a58dd11..96103894ad69 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4057,6 +4057,9 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, napi->gro_list = NULL; napi->skb = NULL; napi->poll = poll; + if (weight > NAPI_POLL_WEIGHT) + pr_err_once("netif_napi_add() called with weight %d on device %s\n", + weight, dev->name); napi->weight = weight; list_add(&napi->dev_list, &dev->napi_list); napi->dev = dev; -- cgit From 1fd9c467b4f7e08beee41f9771396f39265f4c08 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 5 Mar 2013 11:51:55 +0800 Subject: mfd: arizona: Define additional FLL control registers Signed-off-by: Mark Brown --- include/linux/mfd/arizona/registers.h | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h index 340355136069..a61ce90ecd3f 100644 --- a/include/linux/mfd/arizona/registers.h +++ b/include/linux/mfd/arizona/registers.h @@ -85,12 +85,14 @@ #define ARIZONA_FLL1_CONTROL_6 0x176 #define ARIZONA_FLL1_LOOP_FILTER_TEST_1 0x177 #define ARIZONA_FLL1_NCO_TEST_0 0x178 +#define ARIZONA_FLL1_CONTROL_7 0x179 #define ARIZONA_FLL1_SYNCHRONISER_1 0x181 #define ARIZONA_FLL1_SYNCHRONISER_2 0x182 #define ARIZONA_FLL1_SYNCHRONISER_3 0x183 #define ARIZONA_FLL1_SYNCHRONISER_4 0x184 #define ARIZONA_FLL1_SYNCHRONISER_5 0x185 #define ARIZONA_FLL1_SYNCHRONISER_6 0x186 +#define ARIZONA_FLL1_SYNCHRONISER_7 0x187 #define ARIZONA_FLL1_SPREAD_SPECTRUM 0x189 #define ARIZONA_FLL1_GPIO_CLOCK 0x18A #define ARIZONA_FLL2_CONTROL_1 0x191 @@ -101,12 +103,14 @@ #define ARIZONA_FLL2_CONTROL_6 0x196 #define ARIZONA_FLL2_LOOP_FILTER_TEST_1 0x197 #define ARIZONA_FLL2_NCO_TEST_0 0x198 +#define ARIZONA_FLL2_CONTROL_7 0x199 #define ARIZONA_FLL2_SYNCHRONISER_1 0x1A1 #define ARIZONA_FLL2_SYNCHRONISER_2 0x1A2 #define ARIZONA_FLL2_SYNCHRONISER_3 0x1A3 #define ARIZONA_FLL2_SYNCHRONISER_4 0x1A4 #define ARIZONA_FLL2_SYNCHRONISER_5 0x1A5 #define ARIZONA_FLL2_SYNCHRONISER_6 0x1A6 +#define ARIZONA_FLL2_SYNCHRONISER_7 0x1A7 #define ARIZONA_FLL2_SPREAD_SPECTRUM 0x1A9 #define ARIZONA_FLL2_GPIO_CLOCK 0x1AA #define ARIZONA_MIC_CHARGE_PUMP_1 0x200 @@ -1677,6 +1681,13 @@ #define ARIZONA_FLL1_FRC_INTEG_VAL_SHIFT 0 /* FLL1_FRC_INTEG_VAL - [11:0] */ #define ARIZONA_FLL1_FRC_INTEG_VAL_WIDTH 12 /* FLL1_FRC_INTEG_VAL - [11:0] */ +/* + * R377 (0x179) - FLL1 Control 7 + */ +#define ARIZONA_FLL1_GAIN_MASK 0x003c /* FLL1_GAIN */ +#define ARIZONA_FLL1_GAIN_SHIFT 2 /* FLL1_GAIN */ +#define ARIZONA_FLL1_GAIN_WIDTH 4 /* FLL1_GAIN */ + /* * R385 (0x181) - FLL1 Synchroniser 1 */ @@ -1723,6 +1734,17 @@ #define ARIZONA_FLL1_CLK_SYNC_SRC_SHIFT 0 /* FLL1_CLK_SYNC_SRC - [3:0] */ #define ARIZONA_FLL1_CLK_SYNC_SRC_WIDTH 4 /* FLL1_CLK_SYNC_SRC - [3:0] */ +/* + * R391 (0x187) - FLL1 
Synchroniser 7 + */ +#define ARIZONA_FLL1_SYNC_GAIN_MASK 0x003c /* FLL1_SYNC_GAIN */ +#define ARIZONA_FLL1_SYNC_GAIN_SHIFT 2 /* FLL1_SYNC_GAIN */ +#define ARIZONA_FLL1_SYNC_GAIN_WIDTH 4 /* FLL1_SYNC_GAIN */ +#define ARIZONA_FLL1_SYNC_BW 0x0001 /* FLL1_SYNC_BW */ +#define ARIZONA_FLL1_SYNC_BW_MASK 0x0001 /* FLL1_SYNC_BW */ +#define ARIZONA_FLL1_SYNC_BW_SHIFT 0 /* FLL1_SYNC_BW */ +#define ARIZONA_FLL1_SYNC_BW_WIDTH 1 /* FLL1_SYNC_BW */ + /* * R393 (0x189) - FLL1 Spread Spectrum */ @@ -1815,6 +1837,13 @@ #define ARIZONA_FLL2_FRC_INTEG_VAL_SHIFT 0 /* FLL2_FRC_INTEG_VAL - [11:0] */ #define ARIZONA_FLL2_FRC_INTEG_VAL_WIDTH 12 /* FLL2_FRC_INTEG_VAL - [11:0] */ +/* + * R409 (0x199) - FLL2 Control 7 + */ +#define ARIZONA_FLL2_GAIN_MASK 0x003c /* FLL2_GAIN */ +#define ARIZONA_FLL2_GAIN_SHIFT 2 /* FLL2_GAIN */ +#define ARIZONA_FLL2_GAIN_WIDTH 4 /* FLL2_GAIN */ + /* * R417 (0x1A1) - FLL2 Synchroniser 1 */ @@ -1861,6 +1890,17 @@ #define ARIZONA_FLL2_CLK_SYNC_SRC_SHIFT 0 /* FLL2_CLK_SYNC_SRC - [3:0] */ #define ARIZONA_FLL2_CLK_SYNC_SRC_WIDTH 4 /* FLL2_CLK_SYNC_SRC - [3:0] */ +/* + * R423 (0x1A7) - FLL2 Synchroniser 7 + */ +#define ARIZONA_FLL2_SYNC_GAIN_MASK 0x003c /* FLL2_SYNC_GAIN */ +#define ARIZONA_FLL2_SYNC_GAIN_SHIFT 2 /* FLL2_SYNC_GAIN */ +#define ARIZONA_FLL2_SYNC_GAIN_WIDTH 4 /* FLL2_SYNC_GAIN */ +#define ARIZONA_FLL2_SYNC_BW 0x0001 /* FLL2_SYNC_BW */ +#define ARIZONA_FLL2_SYNC_BW_MASK 0x0001 /* FLL2_SYNC_BW */ +#define ARIZONA_FLL2_SYNC_BW_SHIFT 0 /* FLL2_SYNC_BW */ +#define ARIZONA_FLL2_SYNC_BW_WIDTH 1 /* FLL2_SYNC_BW */ + /* * R425 (0x1A9) - FLL2 Spread Spectrum */ -- cgit From 19a37d1cd5465c10d669a296a2ea24b4c985363b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:05:28 +0800 Subject: sched: Remove some dummy functions No one will call those functions if CONFIG_SCHED_DEBUG=n. Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A748.3050206@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..2715fbb9ea85 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -127,18 +127,6 @@ extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); -#else -static inline void -proc_sched_show_task(struct task_struct *p, struct seq_file *m) -{ -} -static inline void proc_sched_set_task(struct task_struct *p) -{ -} -static inline void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) -{ -} #endif /* -- cgit From 090b582f27ac7b6714661020033160130e5297bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:05:51 +0800 Subject: sched: Remove test_sd_parent() It's unused.
Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A75F.4070202@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 --------- 1 file changed, 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2715fbb9ea85..e880d7d115ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -959,15 +959,6 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); -/* Test a flag in parent sched domain */ -static inline int test_sd_parent(struct sched_domain *sd, int flag) -{ - if (sd->parent && (sd->parent->flags & flag)) - return 1; - - return 0; -} - unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); -- cgit From cc1f4b1f3faed9f2040eff2a75f510b424b3cf18 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:06:09 +0800 Subject: sched: Move SCHED_LOAD_SHIFT macros to kernel/sched/sched.h They are used internally only. Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A771.4070104@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 25 ------------------------- kernel/sched/sched.h | 26 +++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index e880d7d115ef..f8826d04fb12 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -755,31 +755,6 @@ enum cpu_idle_type { CPU_MAX_IDLE_TYPES }; -/* - * Increase resolution of nice-level calculations for 64-bit architectures. - * The extra resolution improves shares distribution and load balancing of - * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup - * hierarchies, especially on larger systems. This is not a user-visible change - * and does not change the user-interface for setting shares/weights. - * - * We increase resolution only if we have enough bits to allow this increased - * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution - * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the - * increased costs. - */ -#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ -# define SCHED_LOAD_RESOLUTION 10 -# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) -# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) -#else -# define SCHED_LOAD_RESOLUTION 0 -# define scale_load(w) (w) -# define scale_load_down(w) (w) -#endif - -#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) -#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) - /* * Increase resolution of cpu_power calculations */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cc03cfdf469f..709a30cdfd85 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -33,6 +33,31 @@ extern __read_mostly int scheduler_running; */ #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) +/* + * Increase resolution of nice-level calculations for 64-bit architectures. + * The extra resolution improves shares distribution and load balancing of + * low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup + * hierarchies, especially on larger systems. 
This is not a user-visible change + * and does not change the user-interface for setting shares/weights. + * + * We increase resolution only if we have enough bits to allow this increased + * resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution + * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the + * increased costs. + */ +#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */ +# define SCHED_LOAD_RESOLUTION 10 +# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION) +# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION) +#else +# define SCHED_LOAD_RESOLUTION 0 +# define scale_load(w) (w) +# define scale_load_down(w) (w) +#endif + +#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION) +#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) + #define NICE_0_LOAD SCHED_LOAD_SCALE #define NICE_0_SHIFT SCHED_LOAD_SHIFT @@ -784,7 +809,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ - static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; -- cgit From 5e6521eaa1ee581a13b904f35b80c5efeb2baccb Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:06:23 +0800 Subject: sched: Move struct sched_group to kernel/sched/sched.h Move struct sched_group_power and sched_group and related inline functions to kernel/sched/sched.h, as they are used internally only. Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A77F.2010705@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 58 ++------------------------------------------------- kernel/sched/sched.h | 56 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 56 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f8826d04fb12..0d641304c0ff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -780,62 +780,6 @@ enum cpu_idle_type { extern int __weak arch_sd_sibiling_asym_packing(void); -struct sched_group_power { - atomic_t ref; - /* - * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. - */ - unsigned int power, power_orig; - unsigned long next_update; - /* - * Number of busy cpus in this group. - */ - atomic_t nr_busy_cpus; - - unsigned long cpumask[0]; /* iteration mask */ -}; - -struct sched_group { - struct sched_group *next; /* Must be a circular list */ - atomic_t ref; - - unsigned int group_weight; - struct sched_group_power *sgp; - - /* - * The CPUs this group covers. - * - * NOTE: this field is variable length. (Allocated dynamically - * by attaching extra space to the end of the structure, - * depending on how many CPUs the kernel has booted up with) - */ - unsigned long cpumask[0]; -}; - -static inline struct cpumask *sched_group_cpus(struct sched_group *sg) -{ - return to_cpumask(sg->cpumask); -} - -/* - * cpumask masking which cpus in the group are allowed to iterate up the domain - * tree. - */ -static inline struct cpumask *sched_group_mask(struct sched_group *sg) -{ - return to_cpumask(sg->sgp->cpumask); -} - -/** - * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. - * @group: The group whose first cpu is to be returned. 
- */ -static inline unsigned int group_first_cpu(struct sched_group *group) -{ - return cpumask_first(sched_group_cpus(group)); -} - struct sched_domain_attr { int relax_domain_level; }; @@ -846,6 +790,8 @@ struct sched_domain_attr { extern int sched_domain_level_max; +struct sched_group; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 709a30cdfd85..1a4a2b19c2f4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -572,6 +572,62 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +struct sched_group_power { + atomic_t ref; + /* + * CPU power of this group, SCHED_LOAD_SCALE being max power for a + * single CPU. + */ + unsigned int power, power_orig; + unsigned long next_update; + /* + * Number of busy cpus in this group. + */ + atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ +}; + +struct sched_group { + struct sched_group *next; /* Must be a circular list */ + atomic_t ref; + + unsigned int group_weight; + struct sched_group_power *sgp; + + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + */ + unsigned long cpumask[0]; +}; + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + extern int group_balance_cpu(struct sched_group *sg); #endif /* CONFIG_SMP */ -- cgit From b13095f07f25464de65f5ce5ea94e16813d67488 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:06:38 +0800 Subject: sched: Move wake flags to kernel/sched/sched.h They are used internally only. 
Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A78E.7040609@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 7 ------- kernel/sched/sched.h | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0d641304c0ff..863b505ac48e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -920,13 +920,6 @@ struct uts_namespace; struct rq; struct sched_domain; -/* - * wake flags - */ -#define WF_SYNC 0x01 /* waker goes to sleep after wakup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x04 /* internal use, task got migrated */ - #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 #ifdef CONFIG_SMP diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a4a2b19c2f4..4e5c2afdac91 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -865,6 +865,13 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) } #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x4 /* internal use, task got migrated */ + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; -- cgit From c82ba9fa7588dfd02d4dc99ad1af486304bc424c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:06:55 +0800 Subject: sched: Move struct sched_class to kernel/sched/sched.h It's used internally only. Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A79F.8090502@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 59 --------------------------------------------------- kernel/sched/sched.h | 55 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 863b505ac48e..04b834fa14bc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -917,65 +917,6 @@ struct mempolicy; struct pipe_inode_info; struct uts_namespace; -struct rq; -struct sched_domain; - -#define ENQUEUE_WAKEUP 1 -#define ENQUEUE_HEAD 2 -#ifdef CONFIG_SMP -#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ -#else -#define ENQUEUE_WAKING 0 -#endif - -#define DEQUEUE_SLEEP 1 - -struct sched_class { - const struct sched_class *next; - - void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); - void (*yield_task) (struct rq *rq); - bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); - - void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); - - struct task_struct * (*pick_next_task) (struct rq *rq); - void (*put_prev_task) (struct rq *rq, struct task_struct *p); - -#ifdef CONFIG_SMP - int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); - void (*migrate_task_rq)(struct task_struct *p, int next_cpu); - - void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); - void (*post_schedule) (struct rq *this_rq); - void (*task_waking) (struct task_struct *task); - void (*task_woken) (struct rq *this_rq, struct task_struct *task); - - void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask); - - void (*rq_online)(struct rq *rq); - void (*rq_offline)(struct rq *rq); -#endif - - void 
(*set_curr_task) (struct rq *rq); - void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); - void (*task_fork) (struct task_struct *p); - - void (*switched_from) (struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - - unsigned int (*get_rr_interval) (struct rq *rq, - struct task_struct *task); - -#ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p, int on_rq); -#endif -}; - struct load_weight { unsigned long weight, inv_weight; }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4e5c2afdac91..eca526d7afbd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -951,6 +951,61 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; +#define ENQUEUE_WAKEUP 1 +#define ENQUEUE_HEAD 2 +#ifdef CONFIG_SMP +#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */ +#else +#define ENQUEUE_WAKING 0 +#endif + +#define DEQUEUE_SLEEP 1 + +struct sched_class { + const struct sched_class *next; + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); + + void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); + + struct task_struct * (*pick_next_task) (struct rq *rq); + void (*put_prev_task) (struct rq *rq, struct task_struct *p); + +#ifdef CONFIG_SMP + int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); + + void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); + void (*post_schedule) (struct rq *this_rq); + void (*task_waking) (struct task_struct *task); + void (*task_woken) (struct rq *this_rq, struct task_struct *task); + + void (*set_cpus_allowed)(struct task_struct *p, + const struct cpumask *newmask); + + void (*rq_online)(struct rq *rq); + void (*rq_offline)(struct rq *rq); +#endif + + void (*set_curr_task) (struct rq *rq); + void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); + void (*task_fork) (struct task_struct *p); + + void (*switched_from) (struct rq *this_rq, struct task_struct *task); + void (*switched_to) (struct rq *this_rq, struct task_struct *task); + void (*prio_changed) (struct rq *this_rq, struct task_struct *task, + int oldprio); + + unsigned int (*get_rr_interval) (struct rq *rq, + struct task_struct *task); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*task_move_group) (struct task_struct *p, int on_rq); +#endif +}; #define sched_class_highest (&stop_sched_class) #define for_each_class(class) \ -- cgit From 15f803c94bd92b17708aad9e74226fd0b2c9130c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:07:11 +0800 Subject: sched: Make default_scale_freq_power() static As default_scale_{freq,smt}_power() and update_rt_power() are used in kernel/sched/fair.c only, annotate them as static functions. 
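For illustration only, not part of the patch: the reason the arch_scale_*_power() hooks in the hunk below stay global while their defaults become static is the usual weak-symbol override pattern. A minimal standalone sketch, with hypothetical names:

#include <stdio.h>

/* Generic fallback, now private to the file that uses it. */
static unsigned long default_scale_power(void)
{
	return 1024;
}

/* Weak global hook: an architecture that defines a strong
 * arch_scale_power() elsewhere wins at link time; otherwise this
 * version, which calls the static default, is used. */
unsigned long __attribute__((weak)) arch_scale_power(void)
{
	return default_scale_power();
}

int main(void)
{
	printf("%lu\n", arch_scale_power()); /* 1024 unless overridden */
	return 0;
}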
Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A7AF.8010900@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 --- kernel/sched/fair.c | 6 +++--- 2 files changed, 3 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 04b834fa14bc..eadd113e1eb2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -880,9 +880,6 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu); -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu); - bool cpus_share_cache(int this_cpu, int that_cpu); #else /* CONFIG_SMP */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7a33e5986fc5..9f2311256ae0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4245,7 +4245,7 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) { return SCHED_POWER_SCALE; } @@ -4255,7 +4255,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) return default_scale_freq_power(sd, cpu); } -unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = sd->span_weight; unsigned long smt_gain = sd->smt_gain; @@ -4270,7 +4270,7 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) return default_scale_smt_power(sd, cpu); } -unsigned long scale_rt_power(int cpu) +static unsigned long scale_rt_power(int cpu) { struct rq *rq = cpu_rq(cpu); u64 total, available, age_stamp, avg; -- cgit From 25cc7da7e6336d3bb6a5bad3d3fa96fce9a81d5b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 16:07:33 +0800 Subject: sched: Move group scheduling functions out of include/linux/sched.h - Make sched_group_{set_,}runtime(), sched_group_{set_,}period() and sched_rt_can_attach() static. - Move sched_{create,destroy,online,offline}_group() to kernel/sched/sched.h. - Remove declaration of sched_group_shares(). 
Signed-off-by: Li Zefan Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5135A7C5.3000708@huawei.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 21 --------------------- kernel/sched/core.c | 10 +++++----- kernel/sched/sched.h | 12 ++++++++++++ 3 files changed, 17 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index eadd113e1eb2..fc039ceccbea 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2512,28 +2512,7 @@ extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #ifdef CONFIG_CGROUP_SCHED - extern struct task_group root_task_group; - -extern struct task_group *sched_create_group(struct task_group *parent); -extern void sched_online_group(struct task_group *tg, - struct task_group *parent); -extern void sched_destroy_group(struct task_group *tg); -extern void sched_offline_group(struct task_group *tg); -extern void sched_move_task(struct task_struct *tsk); -#ifdef CONFIG_FAIR_GROUP_SCHED -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -extern unsigned long sched_group_shares(struct task_group *tg); -#endif -#ifdef CONFIG_RT_GROUP_SCHED -extern int sched_group_set_rt_runtime(struct task_group *tg, - long rt_runtime_us); -extern long sched_group_rt_runtime(struct task_group *tg); -extern int sched_group_set_rt_period(struct task_group *tg, - long rt_period_us); -extern long sched_group_rt_period(struct task_group *tg); -extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk); -#endif #endif /* CONFIG_CGROUP_SCHED */ extern int task_can_switch_user(struct user_struct *up, diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..9ad26c986441 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7455,7 +7455,7 @@ unlock: return err; } -int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7467,7 +7467,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_runtime(struct task_group *tg) +static long sched_group_rt_runtime(struct task_group *tg) { u64 rt_runtime_us; @@ -7479,7 +7479,7 @@ long sched_group_rt_runtime(struct task_group *tg) return rt_runtime_us; } -int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) +static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) { u64 rt_runtime, rt_period; @@ -7492,7 +7492,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us) return tg_set_rt_bandwidth(tg, rt_period, rt_runtime); } -long sched_group_rt_period(struct task_group *tg) +static long sched_group_rt_period(struct task_group *tg) { u64 rt_period_us; @@ -7527,7 +7527,7 @@ static int sched_rt_global_constraints(void) return ret; } -int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) { /* Don't accept realtime tasks when there is no way for them to run */ if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index eca526d7afbd..304fc1c77143 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -221,6 +221,18 @@ extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq 
*rt_rq, struct sched_rt_entity *rt_se, int cpu, struct sched_rt_entity *parent); +extern struct task_group *sched_create_group(struct task_group *parent); +extern void sched_online_group(struct task_group *tg, + struct task_group *parent); +extern void sched_destroy_group(struct task_group *tg); +extern void sched_offline_group(struct task_group *tg); + +extern void sched_move_task(struct task_struct *tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED +extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); +#endif + #else /* CONFIG_CGROUP_SCHED */ struct cfs_bandwidth { }; -- cgit From 877c685607925238e302cd3aa38788dca6c1b226 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 5 Mar 2013 11:38:08 +0800 Subject: perf: Remove include of cgroup.h from perf_event.h Move struct perf_cgroup_info and perf_cgroup to kernel/perf/core.c, and then we can remove include of cgroup.h. Signed-off-by: Li Zefan Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tejun Heo Link: http://lkml.kernel.org/r/513568A0.6020804@huawei.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 18 +----------------- kernel/events/core.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e47ee462c2f2..8737e1cee8b2 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -21,7 +21,6 @@ */ #ifdef CONFIG_PERF_EVENTS -# include # include # include #endif @@ -299,22 +298,7 @@ struct swevent_hlist { #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 -#ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info *info; /* timing info, one per cpu */ -}; -#endif - +struct perf_cgroup; struct ring_buffer; /** diff --git a/kernel/events/core.c b/kernel/events/core.c index b0cd86501c30..5976a2a6b4ce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "internal.h" @@ -233,6 +234,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info *info; +}; + /* * Must ensure cgroup is pinned (css_get) before calling * this function. In other words, we cannot call this function -- cgit From 9a886586c82aa02cb49f8c85e961595716884545 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 15 Feb 2013 19:25:00 +0100 Subject: wireless: move sequence number arithmetic to ieee80211.h Move the sequence number arithmetic code from mac80211 to ieee80211.h so others can use it. Also rename the functions from _seq to _sn, they operate on the sequence number, not the sequence_control field. Also move macros to convert the sequence control to/from the sequence number value from various drivers. 
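For illustration only, not part of the patch: the helpers added below implement modulo-4096 arithmetic on the 12-bit sequence number. A minimal user-space sketch of the wraparound behaviour, with the constants restated from the hunk (IEEE80211_SCTL_SEQ is 0xFFF0, so the sequence-number mask is 0xfff):

#include <stdio.h>
#include <stdint.h>

#define IEEE80211_SN_MASK   0x0fff /* restated: IEEE80211_SCTL_SEQ >> 4 */
#define IEEE80211_SN_MODULO (IEEE80211_SN_MASK + 1)

static int ieee80211_sn_less(uint16_t sn1, uint16_t sn2)
{
	/* "less than" on a circular 12-bit space: true when sn1 trails
	 * sn2 by less than half the modulo. */
	return ((sn1 - sn2) & IEEE80211_SN_MASK) > (IEEE80211_SN_MODULO >> 1);
}

static uint16_t ieee80211_sn_add(uint16_t sn1, uint16_t sn2)
{
	return (sn1 + sn2) & IEEE80211_SN_MASK;
}

int main(void)
{
	printf("%d\n", ieee80211_sn_less(4094, 2)); /* 1: 4094 precedes 2 across the wrap */
	printf("%d\n", ieee80211_sn_less(2, 4094)); /* 0 */
	printf("%u\n", ieee80211_sn_add(4094, 4));  /* 2 */
	return 0;
}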
Signed-off-by: Johannes Berg --- drivers/net/wireless/iwlegacy/3945.h | 4 --- drivers/net/wireless/iwlegacy/4965-mac.c | 13 ++++---- drivers/net/wireless/iwlegacy/common.h | 4 --- drivers/net/wireless/iwlwifi/dvm/tx.c | 11 ++++--- drivers/net/wireless/iwlwifi/iwl-trans.h | 3 -- drivers/net/wireless/iwlwifi/mvm/sta.c | 4 +-- drivers/net/wireless/iwlwifi/mvm/tx.c | 2 +- drivers/net/wireless/iwlwifi/pcie/tx.c | 2 +- drivers/net/wireless/rtlwifi/wifi.h | 3 -- include/linux/ieee80211.h | 28 +++++++++++++++++ net/mac80211/rx.c | 53 +++++++++++++------------------- 11 files changed, 66 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/wireless/iwlegacy/3945.h b/drivers/net/wireless/iwlegacy/3945.h index 1d45075e0d5b..9a8703def0ba 100644 --- a/drivers/net/wireless/iwlegacy/3945.h +++ b/drivers/net/wireless/iwlegacy/3945.h @@ -150,10 +150,6 @@ struct il3945_frame { struct list_head list; }; -#define SEQ_TO_SN(seq) (((seq) & IEEE80211_SCTL_SEQ) >> 4) -#define SN_TO_SEQ(ssn) (((ssn) << 4) & IEEE80211_SCTL_SEQ) -#define MAX_SN ((IEEE80211_SCTL_SEQ) >> 4) - #define SUP_RATE_11A_MAX_NUM_CHANNELS 8 #define SUP_RATE_11B_MAX_NUM_CHANNELS 4 #define SUP_RATE_11G_MAX_NUM_CHANNELS 12 diff --git a/drivers/net/wireless/iwlegacy/4965-mac.c b/drivers/net/wireless/iwlegacy/4965-mac.c index 7941eb3a0166..c092fcbbe965 100644 --- a/drivers/net/wireless/iwlegacy/4965-mac.c +++ b/drivers/net/wireless/iwlegacy/4965-mac.c @@ -2258,7 +2258,7 @@ il4965_tx_agg_start(struct il_priv *il, struct ieee80211_vif *vif, spin_lock_irqsave(&il->sta_lock, flags); tid_data = &il->stations[sta_id].tid[tid]; - *ssn = SEQ_TO_SN(tid_data->seq_number); + *ssn = IEEE80211_SEQ_TO_SN(tid_data->seq_number); tid_data->agg.txq_id = txq_id; il_set_swq_id(&il->txq[txq_id], il4965_get_ac_from_tid(tid), txq_id); spin_unlock_irqrestore(&il->sta_lock, flags); @@ -2408,7 +2408,7 @@ il4965_txq_check_empty(struct il_priv *il, int sta_id, u8 tid, int txq_id) /* aggregated HW queue */ if (txq_id == tid_data->agg.txq_id && q->read_ptr == q->write_ptr) { - u16 ssn = SEQ_TO_SN(tid_data->seq_number); + u16 ssn = IEEE80211_SEQ_TO_SN(tid_data->seq_number); int tx_fifo = il4965_get_fifo_from_tid(tid); D_HT("HW queue empty: continue DELBA flow\n"); il4965_txq_agg_disable(il, txq_id, ssn, tx_fifo); @@ -2627,7 +2627,8 @@ il4965_get_ra_sta_id(struct il_priv *il, struct ieee80211_hdr *hdr) static inline u32 il4965_get_scd_ssn(struct il4965_tx_resp *tx_resp) { - return le32_to_cpup(&tx_resp->u.status + tx_resp->frame_count) & MAX_SN; + return le32_to_cpup(&tx_resp->u.status + + tx_resp->frame_count) & IEEE80211_MAX_SN; } static inline u32 @@ -2717,15 +2718,15 @@ il4965_tx_status_reply_tx(struct il_priv *il, struct il_ht_agg *agg, hdr = (struct ieee80211_hdr *) skb->data; sc = le16_to_cpu(hdr->seq_ctrl); - if (idx != (SEQ_TO_SN(sc) & 0xff)) { + if (idx != (IEEE80211_SEQ_TO_SN(sc) & 0xff)) { IL_ERR("BUG_ON idx doesn't match seq control" " idx=%d, seq_idx=%d, seq=%d\n", idx, - SEQ_TO_SN(sc), hdr->seq_ctrl); + IEEE80211_SEQ_TO_SN(sc), hdr->seq_ctrl); return -1; } D_TX_REPLY("AGG Frame i=%d idx %d seq=%d\n", i, idx, - SEQ_TO_SN(sc)); + IEEE80211_SEQ_TO_SN(sc)); sh = idx - start; if (sh > 64) { diff --git a/drivers/net/wireless/iwlegacy/common.h b/drivers/net/wireless/iwlegacy/common.h index 96f2025d936e..73bd3ef316c8 100644 --- a/drivers/net/wireless/iwlegacy/common.h +++ b/drivers/net/wireless/iwlegacy/common.h @@ -541,10 +541,6 @@ struct il_frame { struct list_head list; }; -#define SEQ_TO_SN(seq) (((seq) & IEEE80211_SCTL_SEQ) >> 4) 
-#define SN_TO_SEQ(ssn) (((ssn) << 4) & IEEE80211_SCTL_SEQ) -#define MAX_SN ((IEEE80211_SCTL_SEQ) >> 4) - enum { CMD_SYNC = 0, CMD_SIZE_NORMAL = 0, diff --git a/drivers/net/wireless/iwlwifi/dvm/tx.c b/drivers/net/wireless/iwlwifi/dvm/tx.c index 6aec2df3bb27..d499a0366fa6 100644 --- a/drivers/net/wireless/iwlwifi/dvm/tx.c +++ b/drivers/net/wireless/iwlwifi/dvm/tx.c @@ -418,7 +418,8 @@ int iwlagn_tx_skb(struct iwl_priv *priv, " Tx flags = 0x%08x, agg.state = %d", info->flags, tid_data->agg.state); IWL_ERR(priv, "sta_id = %d, tid = %d seq_num = %d", - sta_id, tid, SEQ_TO_SN(tid_data->seq_number)); + sta_id, tid, + IEEE80211_SEQ_TO_SN(tid_data->seq_number)); goto drop_unlock_sta; } @@ -569,7 +570,7 @@ int iwlagn_tx_agg_stop(struct iwl_priv *priv, struct ieee80211_vif *vif, return 0; } - tid_data->agg.ssn = SEQ_TO_SN(tid_data->seq_number); + tid_data->agg.ssn = IEEE80211_SEQ_TO_SN(tid_data->seq_number); /* There are still packets for this RA / TID in the HW */ if (!test_bit(txq_id, priv->agg_q_alloc)) { @@ -651,7 +652,7 @@ int iwlagn_tx_agg_start(struct iwl_priv *priv, struct ieee80211_vif *vif, spin_lock_bh(&priv->sta_lock); tid_data = &priv->tid_data[sta_id][tid]; - tid_data->agg.ssn = SEQ_TO_SN(tid_data->seq_number); + tid_data->agg.ssn = IEEE80211_SEQ_TO_SN(tid_data->seq_number); tid_data->agg.txq_id = txq_id; *ssn = tid_data->agg.ssn; @@ -911,7 +912,7 @@ static void iwlagn_count_agg_tx_err_status(struct iwl_priv *priv, u16 status) static inline u32 iwlagn_get_scd_ssn(struct iwlagn_tx_resp *tx_resp) { return le32_to_cpup((__le32 *)&tx_resp->status + - tx_resp->frame_count) & MAX_SN; + tx_resp->frame_count) & IEEE80211_MAX_SN; } static void iwl_rx_reply_tx_agg(struct iwl_priv *priv, @@ -1148,7 +1149,7 @@ int iwlagn_rx_reply_tx(struct iwl_priv *priv, struct iwl_rx_cmd_buffer *rxb, if (tx_resp->frame_count == 1) { u16 next_reclaimed = le16_to_cpu(tx_resp->seq_ctl); - next_reclaimed = SEQ_TO_SN(next_reclaimed + 0x10); + next_reclaimed = IEEE80211_SEQ_TO_SN(next_reclaimed + 0x10); if (is_agg) { /* If this is an aggregation queue, we can rely on the diff --git a/drivers/net/wireless/iwlwifi/iwl-trans.h b/drivers/net/wireless/iwlwifi/iwl-trans.h index 8c7bec6b9a0b..00bdc5b00af3 100644 --- a/drivers/net/wireless/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/iwlwifi/iwl-trans.h @@ -114,9 +114,6 @@ * completely agnostic to these differences. * The transport does provide helper functionnality (i.e. 
SYNC / ASYNC mode), */ -#define SEQ_TO_SN(seq) (((seq) & IEEE80211_SCTL_SEQ) >> 4) -#define SN_TO_SEQ(ssn) (((ssn) << 4) & IEEE80211_SCTL_SEQ) -#define MAX_SN ((IEEE80211_SCTL_SEQ) >> 4) #define SEQ_TO_QUEUE(s) (((s) >> 8) & 0x1f) #define QUEUE_TO_SEQ(q) (((q) & 0x1f) << 8) #define SEQ_TO_INDEX(s) ((s) & 0xff) diff --git a/drivers/net/wireless/iwlwifi/mvm/sta.c b/drivers/net/wireless/iwlwifi/mvm/sta.c index 861a7f9f8e7f..52aecf20d0df 100644 --- a/drivers/net/wireless/iwlwifi/mvm/sta.c +++ b/drivers/net/wireless/iwlwifi/mvm/sta.c @@ -686,7 +686,7 @@ int iwl_mvm_sta_tx_agg_start(struct iwl_mvm *mvm, struct ieee80211_vif *vif, spin_lock_bh(&mvmsta->lock); tid_data = &mvmsta->tid_data[tid]; - tid_data->ssn = SEQ_TO_SN(tid_data->seq_number); + tid_data->ssn = IEEE80211_SEQ_TO_SN(tid_data->seq_number); tid_data->txq_id = txq_id; *ssn = tid_data->ssn; @@ -779,7 +779,7 @@ int iwl_mvm_sta_tx_agg_stop(struct iwl_mvm *mvm, struct ieee80211_vif *vif, switch (tid_data->state) { case IWL_AGG_ON: - tid_data->ssn = SEQ_TO_SN(tid_data->seq_number); + tid_data->ssn = IEEE80211_SEQ_TO_SN(tid_data->seq_number); IWL_DEBUG_TX_QUEUES(mvm, "ssn = %d, next_recl = %d\n", diff --git a/drivers/net/wireless/iwlwifi/mvm/tx.c b/drivers/net/wireless/iwlwifi/mvm/tx.c index 6b67ce3f679c..56df249b215e 100644 --- a/drivers/net/wireless/iwlwifi/mvm/tx.c +++ b/drivers/net/wireless/iwlwifi/mvm/tx.c @@ -641,7 +641,7 @@ static void iwl_mvm_rx_tx_cmd_single(struct iwl_mvm *mvm, next_reclaimed = ssn; } else { /* The next packet to be reclaimed is the one after this one */ - next_reclaimed = SEQ_TO_SN(seq_ctl + 0x10); + next_reclaimed = IEEE80211_SEQ_TO_SN(seq_ctl + 0x10); } IWL_DEBUG_TX_REPLY(mvm, diff --git a/drivers/net/wireless/iwlwifi/pcie/tx.c b/drivers/net/wireless/iwlwifi/pcie/tx.c index 8e9e3212fe78..ad7441dfa6fb 100644 --- a/drivers/net/wireless/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/iwlwifi/pcie/tx.c @@ -1581,7 +1581,7 @@ int iwl_trans_pcie_tx(struct iwl_trans *trans, struct sk_buff *skb, * Check here that the packets are in the right place on the ring. 
*/ #ifdef CONFIG_IWLWIFI_DEBUG - wifi_seq = SEQ_TO_SN(le16_to_cpu(hdr->seq_ctrl)); + wifi_seq = IEEE80211_SEQ_TO_SN(le16_to_cpu(hdr->seq_ctrl)); WARN_ONCE((iwl_read_prph(trans, SCD_AGGR_SEL) & BIT(txq_id)) && ((wifi_seq & 0xff) != q->write_ptr), "Q: %d WiFi Seq %d tfdNum %d", diff --git a/drivers/net/wireless/rtlwifi/wifi.h b/drivers/net/wireless/rtlwifi/wifi.h index f13258a8d995..c3eff32acf6c 100644 --- a/drivers/net/wireless/rtlwifi/wifi.h +++ b/drivers/net/wireless/rtlwifi/wifi.h @@ -2127,9 +2127,6 @@ value to host byte ordering.*/ #define WLAN_FC_GET_TYPE(fc) (le16_to_cpu(fc) & IEEE80211_FCTL_FTYPE) #define WLAN_FC_GET_STYPE(fc) (le16_to_cpu(fc) & IEEE80211_FCTL_STYPE) #define WLAN_FC_MORE_DATA(fc) (le16_to_cpu(fc) & IEEE80211_FCTL_MOREDATA) -#define SEQ_TO_SN(seq) (((seq) & IEEE80211_SCTL_SEQ) >> 4) -#define SN_TO_SEQ(ssn) (((ssn) << 4) & IEEE80211_SCTL_SEQ) -#define MAX_SN ((IEEE80211_SCTL_SEQ) >> 4) #define RT_RF_OFF_LEVL_ASPM BIT(0) /*PCI ASPM */ #define RT_RF_OFF_LEVL_CLK_REQ BIT(1) /*PCI clock request */ diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 7e24fe0cfbcd..a0c550fb65a6 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -113,6 +113,34 @@ #define IEEE80211_CTL_EXT_SSW_FBACK 0x9000 #define IEEE80211_CTL_EXT_SSW_ACK 0xa000 + +#define IEEE80211_SN_MASK ((IEEE80211_SCTL_SEQ) >> 4) +#define IEEE80211_MAX_SN IEEE80211_SN_MASK +#define IEEE80211_SN_MODULO (IEEE80211_MAX_SN + 1) + +static inline int ieee80211_sn_less(u16 sn1, u16 sn2) +{ + return ((sn1 - sn2) & IEEE80211_SN_MASK) > (IEEE80211_SN_MODULO >> 1); +} + +static inline u16 ieee80211_sn_add(u16 sn1, u16 sn2) +{ + return (sn1 + sn2) & IEEE80211_SN_MASK; +} + +static inline u16 ieee80211_sn_inc(u16 sn) +{ + return ieee80211_sn_add(sn, 1); +} + +static inline u16 ieee80211_sn_sub(u16 sn1, u16 sn2) +{ + return (sn1 - sn2) & IEEE80211_SN_MASK; +} + +#define IEEE80211_SEQ_TO_SN(seq) (((seq) & IEEE80211_SCTL_SEQ) >> 4) +#define IEEE80211_SN_TO_SEQ(ssn) (((ssn) << 4) & IEEE80211_SCTL_SEQ) + /* miscellaneous IEEE 802.11 constants */ #define IEEE80211_MAX_FRAG_THRESHOLD 2352 #define IEEE80211_MAX_RTS_THRESHOLD 2353 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index acf006f2d61a..1f940e2b6f27 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -648,24 +648,6 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) return RX_CONTINUE; } -#define SEQ_MODULO 0x1000 -#define SEQ_MASK 0xfff - -static inline int seq_less(u16 sq1, u16 sq2) -{ - return ((sq1 - sq2) & SEQ_MASK) > (SEQ_MODULO >> 1); -} - -static inline u16 seq_inc(u16 sq) -{ - return (sq + 1) & SEQ_MASK; -} - -static inline u16 seq_sub(u16 sq1, u16 sq2) -{ - return (sq1 - sq2) & SEQ_MASK; -} - static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_rx *tid_agg_rx, int index, @@ -687,7 +669,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata, __skb_queue_tail(frames, skb); no_frame: - tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); + tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num); } static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata, @@ -699,8 +681,9 @@ static void ieee80211_release_reorder_frames(struct ieee80211_sub_if_data *sdata lockdep_assert_held(&tid_agg_rx->reorder_lock); - while (seq_less(tid_agg_rx->head_seq_num, head_seq_num)) { - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + while 
(ieee80211_sn_less(tid_agg_rx->head_seq_num, head_seq_num)) { + index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, frames); @@ -727,8 +710,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, lockdep_assert_held(&tid_agg_rx->reorder_lock); /* release the buffer until next missing frame */ - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % - tid_agg_rx->buf_size; + index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && tid_agg_rx->stored_mpdu_num) { /* @@ -756,19 +739,22 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata, * Increment the head seq# also for the skipped slots. */ tid_agg_rx->head_seq_num = - (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; + (tid_agg_rx->head_seq_num + + skipped) & IEEE80211_SN_MASK; skipped = 0; } } else while (tid_agg_rx->reorder_buf[index]) { ieee80211_release_reorder_frame(sdata, tid_agg_rx, index, frames); - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; } if (tid_agg_rx->stored_mpdu_num) { - j = index = seq_sub(tid_agg_rx->head_seq_num, - tid_agg_rx->ssn) % tid_agg_rx->buf_size; + j = index = ieee80211_sn_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % + tid_agg_rx->buf_size; for (; j != (index - 1) % tid_agg_rx->buf_size; j = (j + 1) % tid_agg_rx->buf_size) { @@ -809,7 +795,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata head_seq_num = tid_agg_rx->head_seq_num; /* frame with out of date sequence number */ - if (seq_less(mpdu_seq_num, head_seq_num)) { + if (ieee80211_sn_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); goto out; } @@ -818,8 +804,9 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata * If frame the sequence number exceeds our buffering window * size release some previous frames to make room for this one. 
*/ - if (!seq_less(mpdu_seq_num, head_seq_num + buf_size)) { - head_seq_num = seq_inc(seq_sub(mpdu_seq_num, buf_size)); + if (!ieee80211_sn_less(mpdu_seq_num, head_seq_num + buf_size)) { + head_seq_num = ieee80211_sn_inc( + ieee80211_sn_sub(mpdu_seq_num, buf_size)); /* release stored frames up to new head to stack */ ieee80211_release_reorder_frames(sdata, tid_agg_rx, head_seq_num, frames); @@ -827,7 +814,8 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata /* Now the new frame is always in the range of the reordering buffer */ - index = seq_sub(mpdu_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; + index = ieee80211_sn_sub(mpdu_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; /* check if we already stored this frame */ if (tid_agg_rx->reorder_buf[index]) { @@ -843,7 +831,8 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata */ if (mpdu_seq_num == tid_agg_rx->head_seq_num && tid_agg_rx->stored_mpdu_num == 0) { - tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); + tid_agg_rx->head_seq_num = + ieee80211_sn_inc(tid_agg_rx->head_seq_num); ret = false; goto out; } -- cgit From b8a31c9a5afff257cc5dd637cda5fef03e12d67b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 22 Feb 2013 17:28:49 +0100 Subject: ieee80211: mark 802.11 related structs as being 2-byte aligned Regardless of what header features they use, or if they align the IP header or not, 802.11 packets from all drivers guarantee a 2-byte alignment (and there's a debug WARN_ON in case they don't). Annotate packet structs with __aligned(2) to allow the compiler to use 16-bit load/store operations on platforms with extremely inefficient unaligned access (e.g. MIPS). This reduces code size and improves performance on affected platforms and causes no binary code change on others. 
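For illustration only, not part of the patch: a standalone sketch of the annotation being applied; the struct here is hypothetical, the real targets are ieee80211_hdr and friends in the hunk below.

#include <stdio.h>
#include <stdint.h>

/* "packed" alone forces the compiler to assume 1-byte alignment, so
 * strict-alignment targets (e.g. MIPS) fall back to byte loads; adding
 * "aligned(2)" restores the 2-byte guarantee the 802.11 rx path already
 * enforces, re-enabling 16-bit loads/stores on the 16-bit fields. */
struct example_hdr {
	uint16_t frame_control;
	uint16_t duration_id;
	uint8_t  addr1[6];
} __attribute__((packed, aligned(2)));

int main(void)
{
	printf("size=%zu align=%zu\n", sizeof(struct example_hdr),
	       _Alignof(struct example_hdr)); /* size=10 align=2 */
	return 0;
}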
Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index a0c550fb65a6..6e352c31fee0 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -213,7 +213,7 @@ struct ieee80211_hdr { u8 addr3[6]; __le16 seq_ctrl; u8 addr4[6]; -} __packed; +} __packed __aligned(2); struct ieee80211_hdr_3addr { __le16 frame_control; @@ -222,7 +222,7 @@ struct ieee80211_hdr_3addr { u8 addr2[6]; u8 addr3[6]; __le16 seq_ctrl; -} __packed; +} __packed __aligned(2); struct ieee80211_qos_hdr { __le16 frame_control; @@ -232,7 +232,7 @@ struct ieee80211_qos_hdr { u8 addr3[6]; __le16 seq_ctrl; __le16 qos_ctrl; -} __packed; +} __packed __aligned(2); /** * ieee80211_has_tods - check if IEEE80211_FCTL_TODS is set @@ -609,7 +609,7 @@ struct ieee80211s_hdr { __le32 seqnum; u8 eaddr1[6]; u8 eaddr2[6]; -} __packed; +} __packed __aligned(2); /* Mesh flags */ #define MESH_FLAGS_AE_A4 0x1 @@ -903,7 +903,7 @@ struct ieee80211_mgmt { } u; } __packed action; } u; -} __packed; +} __packed __aligned(2); /* Supported Rates value encodings in 802.11n-2009 7.3.2.2 */ #define BSS_MEMBERSHIP_SELECTOR_HT_PHY 127 @@ -934,20 +934,20 @@ struct ieee80211_rts { __le16 duration; u8 ra[6]; u8 ta[6]; -} __packed; +} __packed __aligned(2); struct ieee80211_cts { __le16 frame_control; __le16 duration; u8 ra[6]; -} __packed; +} __packed __aligned(2); struct ieee80211_pspoll { __le16 frame_control; __le16 aid; u8 bssid[6]; u8 ta[6]; -} __packed; +} __packed __aligned(2); /* TDLS */ -- cgit From c8bb93f5f5d478a01db66127844d1d2dd30abec7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 21 Feb 2013 17:26:44 +0100 Subject: wireless: remove unused VHT MCS defines There's an enum with the same values (but slightly different names except for NOT_SUPPORTED) that is actually used, so remove the defines. Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 6e352c31fee0..35c1f96d9365 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1318,11 +1318,6 @@ struct ieee80211_vht_operation { } __packed; -#define IEEE80211_VHT_MCS_ZERO_TO_SEVEN_SUPPORT 0 -#define IEEE80211_VHT_MCS_ZERO_TO_EIGHT_SUPPORT 1 -#define IEEE80211_VHT_MCS_ZERO_TO_NINE_SUPPORT 2 -#define IEEE80211_VHT_MCS_NOT_SUPPORTED 3 - /* 802.11ac VHT Capabilities */ #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 0x00000000 #define IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 0x00000001 -- cgit From 55d942f4246c79a8f3f17f92c224e641c5c26125 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 1 Mar 2013 13:07:48 +0100 Subject: mac80211: restrict peer's VHT capabilities to own Implement restricting peer VHT capabilities to the device's own capabilities. This is useful when a single driver supports more than one device and the devices have different capabilities (often they will differ in the number of spatial streams), but in particular is also necessary for VHT capability overrides to work correctly -- otherwise it'd be possible to e.g. advertise, due to overrides, that TX-STBC is not supported, but then still use it to TX to the AP because it supports RX-STBC. 
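For illustration only, not part of the patch: the STBC case from the hunk below, reduced to a standalone sketch. Our transmit-side capability gates the peer's receive-side one and vice versa, so a capability we overrode away can never be used toward the AP. Mask values are taken from the patch; the helper name is hypothetical.

#include <stdio.h>
#include <stdint.h>

#define VHT_CAP_TXSTBC      0x00000080 /* IEEE80211_VHT_CAP_TXSTBC */
#define VHT_CAP_RXSTBC_MASK 0x00000700 /* IEEE80211_VHT_CAP_RXSTBC_MASK */

static uint32_t restrict_stbc(uint32_t own_cap, uint32_t peer_cap)
{
	uint32_t cap = 0;

	/* Keep the peer's RX-STBC bits only if we advertised TX-STBC... */
	if (own_cap & VHT_CAP_TXSTBC)
		cap |= peer_cap & VHT_CAP_RXSTBC_MASK;
	/* ...and its TX-STBC bit only if we can receive STBC ourselves. */
	if (own_cap & VHT_CAP_RXSTBC_MASK)
		cap |= peer_cap & VHT_CAP_TXSTBC;
	return cap;
}

int main(void)
{
	/* Peer advertises RX-STBC, but our (overridden) caps lack TX-STBC:
	 * nothing survives the intersection. */
	printf("%#x\n", restrict_stbc(0, VHT_CAP_RXSTBC_MASK)); /* 0 */
	return 0;
}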
Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 3 +- include/net/mac80211.h | 5 +- net/mac80211/vht.c | 114 +++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 117 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 35c1f96d9365..4cf0c9e4dd99 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1333,10 +1333,11 @@ struct ieee80211_vht_operation { #define IEEE80211_VHT_CAP_RXSTBC_2 0x00000200 #define IEEE80211_VHT_CAP_RXSTBC_3 0x00000300 #define IEEE80211_VHT_CAP_RXSTBC_4 0x00000400 +#define IEEE80211_VHT_CAP_RXSTBC_MASK 0x00000700 #define IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE 0x00000800 #define IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE 0x00001000 #define IEEE80211_VHT_CAP_BEAMFORMER_ANTENNAS_MAX 0x00006000 -#define IEEE80211_VHT_CAP_SOUNDING_DIMENTION_MAX 0x00030000 +#define IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MAX 0x00030000 #define IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE 0x00080000 #define IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE 0x00100000 #define IEEE80211_VHT_CAP_VHT_TXOP_PS 0x00200000 diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 421c3ac8c521..cdd7cea1fd4c 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1228,9 +1228,8 @@ enum ieee80211_sta_rx_bandwidth { * @addr: MAC address * @aid: AID we assigned to the station if we're an AP * @supp_rates: Bitmap of supported rates (per band) - * @ht_cap: HT capabilities of this STA; restricted to our own TX capabilities - * @vht_cap: VHT capabilities of this STA; Not restricting any capabilities - * of remote STA. Taking as is. + * @ht_cap: HT capabilities of this STA; restricted to our own capabilities + * @vht_cap: VHT capabilities of this STA; restricted to our own capabilities * @wme: indicates whether the STA supports WME. Only valid during AP-mode. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void *), size is determined in hw information. diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c index cacc1c74556a..171344d4eb7c 100644 --- a/net/mac80211/vht.c +++ b/net/mac80211/vht.c @@ -118,6 +118,8 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, struct sta_info *sta) { struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap; + struct ieee80211_sta_vht_cap own_cap; + u32 cap_info, i; memset(vht_cap, 0, sizeof(*vht_cap)); @@ -133,12 +135,122 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata, vht_cap->vht_supported = true; - vht_cap->cap = le32_to_cpu(vht_cap_ie->vht_cap_info); + own_cap = sband->vht_cap; + /* + * If user has specified capability overrides, take care + * of that if the station we're setting up is the AP that + * we advertised a restricted capability set to. Override + * our own capabilities and then use those below. 
+ */ + if (sdata->vif.type == NL80211_IFTYPE_STATION && + !test_sta_flag(sta, WLAN_STA_TDLS_PEER)) + ieee80211_apply_vhtcap_overrides(sdata, &own_cap); + + /* take some capabilities as-is */ + cap_info = le32_to_cpu(vht_cap_ie->vht_cap_info); + vht_cap->cap = cap_info; + vht_cap->cap &= IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895 | + IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991 | + IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454 | + IEEE80211_VHT_CAP_RXLDPC | + IEEE80211_VHT_CAP_VHT_TXOP_PS | + IEEE80211_VHT_CAP_HTC_VHT | + IEEE80211_VHT_CAP_MAX_A_MPDU_LENGTH_EXPONENT_MASK | + IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_UNSOL_MFB | + IEEE80211_VHT_CAP_VHT_LINK_ADAPTATION_VHT_MRQ_MFB | + IEEE80211_VHT_CAP_RX_ANTENNA_PATTERN | + IEEE80211_VHT_CAP_TX_ANTENNA_PATTERN; + + /* and some based on our own capabilities */ + switch (own_cap.cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ; + break; + case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK; + break; + default: + /* nothing */ + break; + } + + /* symmetric capabilities */ + vht_cap->cap |= cap_info & own_cap.cap & + (IEEE80211_VHT_CAP_SHORT_GI_80 | + IEEE80211_VHT_CAP_SHORT_GI_160); + + /* remaining ones */ + if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE) { + vht_cap->cap |= cap_info & + (IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE | + IEEE80211_VHT_CAP_BEAMFORMER_ANTENNAS_MAX | + IEEE80211_VHT_CAP_SOUNDING_DIMENSIONS_MAX); + } + + if (own_cap.cap & IEEE80211_VHT_CAP_SU_BEAMFORMER_CAPABLE) + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_SU_BEAMFORMEE_CAPABLE; + + if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE) + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE; + + if (own_cap.cap & IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE) + vht_cap->cap |= cap_info & + IEEE80211_VHT_CAP_MU_BEAMFORMER_CAPABLE; + + if (own_cap.cap & IEEE80211_VHT_CAP_TXSTBC) + vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_RXSTBC_MASK; + + if (own_cap.cap & IEEE80211_VHT_CAP_RXSTBC_MASK) + vht_cap->cap |= cap_info & IEEE80211_VHT_CAP_TXSTBC; /* Copy peer MCS info, the driver might need them. 
*/ memcpy(&vht_cap->vht_mcs, &vht_cap_ie->supp_mcs, sizeof(struct ieee80211_vht_mcs_info)); + /* but also restrict MCSes */ + for (i = 0; i < 8; i++) { + u16 own_rx, own_tx, peer_rx, peer_tx; + + own_rx = le16_to_cpu(own_cap.vht_mcs.rx_mcs_map); + own_rx = (own_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + own_tx = le16_to_cpu(own_cap.vht_mcs.tx_mcs_map); + own_tx = (own_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + peer_rx = le16_to_cpu(vht_cap->vht_mcs.rx_mcs_map); + peer_rx = (peer_rx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + peer_tx = le16_to_cpu(vht_cap->vht_mcs.tx_mcs_map); + peer_tx = (peer_tx >> i * 2) & IEEE80211_VHT_MCS_NOT_SUPPORTED; + + if (peer_tx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { + if (own_rx == IEEE80211_VHT_MCS_NOT_SUPPORTED) + peer_tx = IEEE80211_VHT_MCS_NOT_SUPPORTED; + else if (own_rx < peer_tx) + peer_tx = own_rx; + } + + if (peer_rx != IEEE80211_VHT_MCS_NOT_SUPPORTED) { + if (own_tx == IEEE80211_VHT_MCS_NOT_SUPPORTED) + peer_rx = IEEE80211_VHT_MCS_NOT_SUPPORTED; + else if (own_tx < peer_rx) + peer_rx = own_tx; + } + + vht_cap->vht_mcs.rx_mcs_map &= + ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); + vht_cap->vht_mcs.rx_mcs_map |= cpu_to_le16(peer_rx << i * 2); + + vht_cap->vht_mcs.tx_mcs_map &= + ~cpu_to_le16(IEEE80211_VHT_MCS_NOT_SUPPORTED << i * 2); + vht_cap->vht_mcs.tx_mcs_map |= cpu_to_le16(peer_tx << i * 2); + } + + /* finally set up the bandwidth */ switch (vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK) { case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ: case IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ: -- cgit From acbba0d0f88e2577b9d92b61b136d13f65831a52 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Mar 2013 01:31:12 +0000 Subject: team: introduce two default team_modeop functions and use them in modes No need to duplicate code for this. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- drivers/net/team/team.c | 19 ++++++++++++++++--- drivers/net/team/team_mode_broadcast.c | 14 ++------------ drivers/net/team/team_mode_roundrobin.c | 14 ++------------ include/linux/if_team.h | 5 ++++- 4 files changed, 24 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 05c5efe84591..ece70a4abbb1 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -73,11 +73,24 @@ static int team_port_set_orig_dev_addr(struct team_port *port) return __set_port_dev_addr(port->dev, port->orig.dev_addr); } -int team_port_set_team_dev_addr(struct team_port *port) +static int team_port_set_team_dev_addr(struct team *team, + struct team_port *port) +{ + return __set_port_dev_addr(port->dev, team->dev->dev_addr); +} + +int team_modeop_port_enter(struct team *team, struct team_port *port) +{ + return team_port_set_team_dev_addr(team, port); +} +EXPORT_SYMBOL(team_modeop_port_enter); + +void team_modeop_port_change_dev_addr(struct team *team, + struct team_port *port) { - return __set_port_dev_addr(port->dev, port->team->dev->dev_addr); + team_port_set_team_dev_addr(team, port); } -EXPORT_SYMBOL(team_port_set_team_dev_addr); +EXPORT_SYMBOL(team_modeop_port_change_dev_addr); static void team_refresh_port_linkup(struct team_port *port) { diff --git a/drivers/net/team/team_mode_broadcast.c b/drivers/net/team/team_mode_broadcast.c index c5db428e73fa..c366cd299c06 100644 --- a/drivers/net/team/team_mode_broadcast.c +++ b/drivers/net/team/team_mode_broadcast.c @@ -46,20 +46,10 @@ static bool bc_transmit(struct team *team, struct sk_buff *skb) return sum_ret; } -static int bc_port_enter(struct team *team, struct team_port *port) -{ - return team_port_set_team_dev_addr(port); -} - -static void bc_port_change_dev_addr(struct team *team, struct team_port *port) -{ - team_port_set_team_dev_addr(port); -} - static const struct team_mode_ops bc_mode_ops = { .transmit = bc_transmit, - .port_enter = bc_port_enter, - .port_change_dev_addr = bc_port_change_dev_addr, + .port_enter = team_modeop_port_enter, + .port_change_dev_addr = team_modeop_port_change_dev_addr, }; static const struct team_mode bc_mode = { diff --git a/drivers/net/team/team_mode_roundrobin.c b/drivers/net/team/team_mode_roundrobin.c index 105135aa8f05..ed63a6bc66ce 100644 --- a/drivers/net/team/team_mode_roundrobin.c +++ b/drivers/net/team/team_mode_roundrobin.c @@ -64,20 +64,10 @@ drop: return false; } -static int rr_port_enter(struct team *team, struct team_port *port) -{ - return team_port_set_team_dev_addr(port); -} - -static void rr_port_change_dev_addr(struct team *team, struct team_port *port) -{ - team_port_set_team_dev_addr(port); -} - static const struct team_mode_ops rr_mode_ops = { .transmit = rr_transmit, - .port_enter = rr_port_enter, - .port_change_dev_addr = rr_port_change_dev_addr, + .port_enter = team_modeop_port_enter, + .port_change_dev_addr = team_modeop_port_change_dev_addr, }; static const struct team_mode rr_mode = { diff --git a/include/linux/if_team.h b/include/linux/if_team.h index cfd21e3d5506..3283def74483 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -112,6 +112,10 @@ struct team_mode_ops { void (*port_disabled)(struct team *team, struct team_port *port); }; +extern int team_modeop_port_enter(struct team *team, struct team_port *port); +extern void team_modeop_port_change_dev_addr(struct team *team, + struct team_port *port); + enum team_option_type { TEAM_OPTION_TYPE_U32, TEAM_OPTION_TYPE_STRING, @@ 
-236,7 +240,6 @@ static inline struct team_port *team_get_port_by_index_rcu(struct team *team, return NULL; } -extern int team_port_set_team_dev_addr(struct team_port *port); extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); -- cgit From 753f993911b32e479b4fab5d228dc07c11d1e7e7 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 6 Mar 2013 01:31:13 +0000 Subject: team: introduce random mode As suggested by Eric Dumazet, allow user to select mode which chooses TX port randomly. Functionality should be more or less similar to round-robin mode with even lower overhead. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/team/Kconfig | 12 ++++++ drivers/net/team/Makefile | 1 + drivers/net/team/team_mode_random.c | 71 +++++++++++++++++++++++++++++++++ drivers/net/team/team_mode_roundrobin.c | 22 +--------- include/linux/if_team.h | 20 ++++++++++ 5 files changed, 105 insertions(+), 21 deletions(-) create mode 100644 drivers/net/team/team_mode_random.c (limited to 'include/linux') diff --git a/drivers/net/team/Kconfig b/drivers/net/team/Kconfig index c3011af68e91..c853d84fd99f 100644 --- a/drivers/net/team/Kconfig +++ b/drivers/net/team/Kconfig @@ -37,6 +37,18 @@ config NET_TEAM_MODE_ROUNDROBIN To compile this team mode as a module, choose M here: the module will be called team_mode_roundrobin. +config NET_TEAM_MODE_RANDOM + tristate "Random mode support" + depends on NET_TEAM + ---help--- + Basic mode where port used for transmitting packets is selected + randomly. + + All added ports are setup to have team's device address. + + To compile this team mode as a module, choose M here: the module + will be called team_mode_random. + config NET_TEAM_MODE_ACTIVEBACKUP tristate "Active-backup mode support" depends on NET_TEAM diff --git a/drivers/net/team/Makefile b/drivers/net/team/Makefile index 975763014e5a..c57e85889751 100644 --- a/drivers/net/team/Makefile +++ b/drivers/net/team/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_NET_TEAM) += team.o obj-$(CONFIG_NET_TEAM_MODE_BROADCAST) += team_mode_broadcast.o obj-$(CONFIG_NET_TEAM_MODE_ROUNDROBIN) += team_mode_roundrobin.o +obj-$(CONFIG_NET_TEAM_MODE_RANDOM) += team_mode_random.o obj-$(CONFIG_NET_TEAM_MODE_ACTIVEBACKUP) += team_mode_activebackup.o obj-$(CONFIG_NET_TEAM_MODE_LOADBALANCE) += team_mode_loadbalance.o diff --git a/drivers/net/team/team_mode_random.c b/drivers/net/team/team_mode_random.c new file mode 100644 index 000000000000..9eabfaa22f3e --- /dev/null +++ b/drivers/net/team/team_mode_random.c @@ -0,0 +1,71 @@ +/* + * drivers/net/team/team_mode_random.c - Random mode for team + * Copyright (c) 2013 Jiri Pirko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static u32 random_N(unsigned int N) +{ + return reciprocal_divide(random32(), N); +} + +static bool rnd_transmit(struct team *team, struct sk_buff *skb) +{ + struct team_port *port; + int port_index; + + port_index = random_N(team->en_port_count); + port = team_get_port_by_index_rcu(team, port_index); + port = team_get_first_port_txable_rcu(team, port); + if (unlikely(!port)) + goto drop; + if (team_dev_queue_xmit(team, port, skb)) + return false; + return true; + +drop: + dev_kfree_skb_any(skb); + return false; +} + +static const struct team_mode_ops rnd_mode_ops = { + .transmit = rnd_transmit, + .port_enter = team_modeop_port_enter, + .port_change_dev_addr = team_modeop_port_change_dev_addr, +}; + +static const struct team_mode rnd_mode = { + .kind = "random", + .owner = THIS_MODULE, + .ops = &rnd_mode_ops, +}; + +static int __init rnd_init_module(void) +{ + return team_mode_register(&rnd_mode); +} + +static void __exit rnd_cleanup_module(void) +{ + team_mode_unregister(&rnd_mode); +} + +module_init(rnd_init_module); +module_exit(rnd_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Jiri Pirko "); +MODULE_DESCRIPTION("Random mode for team"); +MODULE_ALIAS("team-mode-random"); diff --git a/drivers/net/team/team_mode_roundrobin.c b/drivers/net/team/team_mode_roundrobin.c index ed63a6bc66ce..d268e4de781b 100644 --- a/drivers/net/team/team_mode_roundrobin.c +++ b/drivers/net/team/team_mode_roundrobin.c @@ -25,26 +25,6 @@ static struct rr_priv *rr_priv(struct team *team) return (struct rr_priv *) &team->mode_priv; } -static struct team_port *__get_first_port_up(struct team *team, - struct team_port *port) -{ - struct team_port *cur; - - if (team_port_txable(port)) - return port; - cur = port; - list_for_each_entry_continue_rcu(cur, &team->port_list, list) - if (team_port_txable(port)) - return cur; - list_for_each_entry_rcu(cur, &team->port_list, list) { - if (cur == port) - break; - if (team_port_txable(port)) - return cur; - } - return NULL; -} - static bool rr_transmit(struct team *team, struct sk_buff *skb) { struct team_port *port; @@ -52,7 +32,7 @@ static bool rr_transmit(struct team *team, struct sk_buff *skb) port_index = rr_priv(team)->sent_packets++ % team->en_port_count; port = team_get_port_by_index_rcu(team, port_index); - port = __get_first_port_up(team, port); + port = team_get_first_port_txable_rcu(team, port); if (unlikely(!port)) goto drop; if (team_dev_queue_xmit(team, port, skb)) diff --git a/include/linux/if_team.h b/include/linux/if_team.h index 3283def74483..4474557904f6 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -240,6 +240,26 @@ static inline struct team_port *team_get_port_by_index_rcu(struct team *team, return NULL; } +static inline struct team_port * +team_get_first_port_txable_rcu(struct team *team, struct team_port *port) +{ + struct team_port *cur; + + if (likely(team_port_txable(port))) + return port; + cur = port; + list_for_each_entry_continue_rcu(cur, &team->port_list, list) + if (team_port_txable(port)) + return cur; + list_for_each_entry_rcu(cur, &team->port_list, list) { + if (cur == port) + break; + if (team_port_txable(port)) + return cur; + } + return NULL; +} + extern int team_options_register(struct team *team, const struct team_option *option, size_t option_count); -- cgit From 8524982847ff00b66ffb89314c342c51f4138ee7 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Mon, 18 Feb 2013 21:47:45 +0100 Subject: ssb: fix unaligned 
access to mac address The mac address should be aligned to u16 to prevent an unaligned access in drivers/ssb/pci.c where it is cast to __be16. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- include/linux/ssb/ssb.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 22958d68ecfe..8b1322296fed 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -26,9 +26,9 @@ struct ssb_sprom_core_pwr_info { struct ssb_sprom { u8 revision; - u8 il0mac[6]; /* MAC address for 802.11b/g */ - u8 et0mac[6]; /* MAC address for Ethernet */ - u8 et1mac[6]; /* MAC address for 802.11a */ + u8 il0mac[6] __aligned(sizeof(u16)); /* MAC address for 802.11b/g */ + u8 et0mac[6] __aligned(sizeof(u16)); /* MAC address for Ethernet */ + u8 et1mac[6] __aligned(sizeof(u16)); /* MAC address for 802.11a */ u8 et0phyaddr; /* MII address for enet0 */ u8 et1phyaddr; /* MII address for enet1 */ u8 et0mdcport; /* MDIO for enet0 */ -- cgit From f04a9d8adf766c480353c0f2427e641251c9b059 Mon Sep 17 00:00:00 2001 From: Rajkumar Kasirajan Date: Wed, 30 May 2012 16:32:37 +0530 Subject: mfd: ab8500-sysctrl: Update correct turn on status In L9540, turn_on_status register is not updated correctly if the device is rebooted with AC/USB charger connected. Due to this, the device boots android instead of entering into charge only mode. Read the AC/USB status register to detect the charger presence and update the turn on status manually. Signed-off-by: Rajkumar Kasirajan Signed-off-by: Per Forlin Signed-off-by: Lee Jones Reviewed-by: Rupesh KUMAR Reviewed-by: Philippe LANGLAIS Tested-by: Rupesh KUMAR Tested-by: Philippe LANGLAIS Acked-by: Samuel Ortiz --- drivers/mfd/ab8500-core.c | 39 +++++++++++++++++++++++++++++++++++++++ drivers/mfd/ab8500-sysctrl.c | 2 +- include/linux/mfd/abx500/ab8500.h | 2 ++ 3 files changed, 42 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c index 7c84ced2e01b..50e6f1e29727 100644 --- a/drivers/mfd/ab8500-core.c +++ b/drivers/mfd/ab8500-core.c @@ -111,6 +111,13 @@ #define AB8500_TURN_ON_STATUS 0x00 +#define AB8500_CH_USBCH_STAT1_REG 0x02 +#define VBUS_DET_DBNC100 0x02 +#define VBUS_DET_DBNC1 0x01 + +static DEFINE_SPINLOCK(on_stat_lock); +static u8 turn_on_stat_mask = 0xFF; +static u8 turn_on_stat_set; static bool no_bm; /* No battery management */ module_param(no_bm, bool, S_IRUGO); @@ -1171,6 +1178,15 @@ static ssize_t show_switch_off_status(struct device *dev, return sprintf(buf, "%#x\n", value); } +/* use mask and set to override the register turn_on_stat value */ +void ab8500_override_turn_on_stat(u8 mask, u8 set) +{ + spin_lock(&on_stat_lock); + turn_on_stat_mask = mask; + turn_on_stat_set = set; + spin_unlock(&on_stat_lock); +} + /* * ab8500 has turned on due to (TURN_ON_STATUS): * 0x01 PORnVbat @@ -1194,6 +1210,20 @@ static ssize_t show_turn_on_status(struct device *dev, AB8500_TURN_ON_STATUS, &value); if (ret < 0) return ret; + + /* + * In L9540, turn_on_status register is not updated correctly if + * the device is rebooted with AC/USB charger connected. Due to + * this, the device boots android instead of entering into charge + * only mode. Read the AC/USB status register to detect the charger + * presence and update the turn on status manually.
+ */ + if (is_ab9540(ab8500)) { + spin_lock(&on_stat_lock); + value = (value & turn_on_stat_mask) | turn_on_stat_set; + spin_unlock(&on_stat_lock); + } + return sprintf(buf, "%#x\n", value); } @@ -1399,6 +1429,15 @@ static int ab8500_probe(struct platform_device *pdev) if (plat && plat->init) plat->init(ab8500); + if (is_ab9540(ab8500)) { + ret = get_register_interruptible(ab8500, AB8500_CHARGER, + AB8500_CH_USBCH_STAT1_REG, &value); + if (ret < 0) + return ret; + if ((value & VBUS_DET_DBNC1) && (value & VBUS_DET_DBNC100)) + ab8500_override_turn_on_stat(~AB8500_POW_KEY_1_ON, + AB8500_VBUS_DET); + } /* Clear and mask all interrupts */ for (i = 0; i < ab8500->mask_size; i++) { diff --git a/drivers/mfd/ab8500-sysctrl.c b/drivers/mfd/ab8500-sysctrl.c index 108fd86552f0..7c773797d267 100644 --- a/drivers/mfd/ab8500-sysctrl.c +++ b/drivers/mfd/ab8500-sysctrl.c @@ -21,7 +21,7 @@ void ab8500_power_off(void) { sigset_t old; sigset_t all; - static char *pss[] = {"ab8500_ac", "ab8500_usb"}; + static char *pss[] = {"ab8500_ac", "pm2301", "ab8500_usb"}; int i; bool charger_present = false; union power_supply_propval val; diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h index 9db0bda446a0..fdd8be64feeb 100644 --- a/include/linux/mfd/abx500/ab8500.h +++ b/include/linux/mfd/abx500/ab8500.h @@ -512,6 +512,8 @@ static inline int is_ab9540_2p0_or_earlier(struct ab8500 *ab) return (is_ab9540(ab) && (ab->chip_id < AB8500_CUT2P0)); } +void ab8500_override_turn_on_stat(u8 mask, u8 set); + #ifdef CONFIG_AB8500_DEBUG void ab8500_dump_all_banks(struct device *dev); void ab8500_debug_register_interrupt(int line); -- cgit From 734823462590335cbf5c6a1fa5cae84a881dcb43 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 26 Feb 2013 10:06:55 +0000 Subject: mfd: ab8500-gpadc: Add gpadc hw conversion Add support for gpadc hw conversion and make the number of samples configurable.
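For reference, a hypothetical in-kernel caller of the extended API; the signatures, constants and device name all come from this patch (see the ab8500-gpadc.h hunk and the debugfs code below), but the function itself is only a usage sketch with error handling kept minimal:

#include <linux/mfd/abx500/ab8500-gpadc.h>

static int example_read_main_bat(void)
{
	struct ab8500_gpadc *gpadc = ab8500_gpadc_get("ab8500-gpadc.0");
	int sw_val, hw_val;

	/* SW conversion, averaging 16 samples; trigger fields are unused */
	sw_val = ab8500_gpadc_sw_hw_convert(gpadc, MAIN_BAT_V,
					    SAMPLE_16, 0, 0, ADC_SW);
	if (sw_val < 0)
		return sw_val;

	/* HW-triggered conversion: 4 samples, rising edge, delay timer 10 */
	hw_val = ab8500_gpadc_sw_hw_convert(gpadc, MAIN_BAT_V,
					    SAMPLE_4, RISING_EDGE, 10, ADC_HW);
	return hw_val;
}

Existing two-argument callers keep working, since ab8500_gpadc_convert() becomes a static inline wrapper that passes SAMPLE_16, 0, 0 and ADC_SW.
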
Signed-off-by: M'boumba Cedric Madianga Signed-off-by: Lee Jones Reviewed-by: Mattias WALLIN Tested-by: Michel JAOUEN Acked-by: Samuel Ortiz --- drivers/mfd/ab8500-debugfs.c | 292 +++++++++++++++++++++++++++++--- drivers/mfd/ab8500-gpadc.c | 282 ++++++++++++++++++++++-------- include/linux/mfd/abx500/ab8500-gpadc.h | 30 +++- 3 files changed, 499 insertions(+), 105 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 45fe3c50eb03..59ecd8c680ed 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -101,6 +101,11 @@ static int num_irqs; static struct device_attribute **dev_attr; static char **event_name; +static u8 avg_sample = SAMPLE_16; +static u8 trig_edge = RISING_EDGE; +static u8 conv_type = ADC_SW; +static u8 trig_timer; + /** * struct ab8500_reg_range * @first: the first address of the range @@ -808,9 +813,10 @@ static int ab8500_gpadc_bat_ctrl_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - bat_ctrl_raw = ab8500_gpadc_read_raw(gpadc, BAT_CTRL); + bat_ctrl_raw = ab8500_gpadc_read_raw(gpadc, BAT_CTRL, + avg_sample, trig_edge, trig_timer, conv_type); bat_ctrl_convert = ab8500_gpadc_ad_to_voltage(gpadc, - BAT_CTRL, bat_ctrl_raw); + BAT_CTRL, bat_ctrl_raw); return seq_printf(s, "%d,0x%X\n", bat_ctrl_convert, bat_ctrl_raw); @@ -836,9 +842,10 @@ static int ab8500_gpadc_btemp_ball_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - btemp_ball_raw = ab8500_gpadc_read_raw(gpadc, BTEMP_BALL); + btemp_ball_raw = ab8500_gpadc_read_raw(gpadc, BTEMP_BALL, + avg_sample, trig_edge, trig_timer, conv_type); btemp_ball_convert = ab8500_gpadc_ad_to_voltage(gpadc, BTEMP_BALL, - btemp_ball_raw); + btemp_ball_raw); return seq_printf(s, "%d,0x%X\n", btemp_ball_convert, btemp_ball_raw); @@ -865,9 +872,10 @@ static int ab8500_gpadc_main_charger_v_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - main_charger_v_raw = ab8500_gpadc_read_raw(gpadc, MAIN_CHARGER_V); + main_charger_v_raw = ab8500_gpadc_read_raw(gpadc, MAIN_CHARGER_V, + avg_sample, trig_edge, trig_timer, conv_type); main_charger_v_convert = ab8500_gpadc_ad_to_voltage(gpadc, - MAIN_CHARGER_V, main_charger_v_raw); + MAIN_CHARGER_V, main_charger_v_raw); return seq_printf(s, "%d,0x%X\n", main_charger_v_convert, main_charger_v_raw); @@ -895,9 +903,10 @@ static int ab8500_gpadc_acc_detect1_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - acc_detect1_raw = ab8500_gpadc_read_raw(gpadc, ACC_DETECT1); + acc_detect1_raw = ab8500_gpadc_read_raw(gpadc, ACC_DETECT1, + avg_sample, trig_edge, trig_timer, conv_type); acc_detect1_convert = ab8500_gpadc_ad_to_voltage(gpadc, ACC_DETECT1, - acc_detect1_raw); + acc_detect1_raw); return seq_printf(s, "%d,0x%X\n", acc_detect1_convert, acc_detect1_raw); @@ -925,9 +934,10 @@ static int ab8500_gpadc_acc_detect2_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - acc_detect2_raw = ab8500_gpadc_read_raw(gpadc, ACC_DETECT2); + acc_detect2_raw = ab8500_gpadc_read_raw(gpadc, ACC_DETECT2, + avg_sample, trig_edge, trig_timer, conv_type); acc_detect2_convert = ab8500_gpadc_ad_to_voltage(gpadc, - ACC_DETECT2, acc_detect2_raw); + ACC_DETECT2, acc_detect2_raw); return seq_printf(s, "%d,0x%X\n", acc_detect2_convert, acc_detect2_raw); @@ -955,9 +965,10 @@ 
static int ab8500_gpadc_aux1_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - aux1_raw = ab8500_gpadc_read_raw(gpadc, ADC_AUX1); + aux1_raw = ab8500_gpadc_read_raw(gpadc, ADC_AUX1, + avg_sample, trig_edge, trig_timer, conv_type); aux1_convert = ab8500_gpadc_ad_to_voltage(gpadc, ADC_AUX1, - aux1_raw); + aux1_raw); return seq_printf(s, "%d,0x%X\n", aux1_convert, aux1_raw); @@ -983,9 +994,10 @@ static int ab8500_gpadc_aux2_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - aux2_raw = ab8500_gpadc_read_raw(gpadc, ADC_AUX2); + aux2_raw = ab8500_gpadc_read_raw(gpadc, ADC_AUX2, + avg_sample, trig_edge, trig_timer, conv_type); aux2_convert = ab8500_gpadc_ad_to_voltage(gpadc, ADC_AUX2, - aux2_raw); + aux2_raw); return seq_printf(s, "%d,0x%X\n", aux2_convert, aux2_raw); @@ -1011,9 +1023,10 @@ static int ab8500_gpadc_main_bat_v_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - main_bat_v_raw = ab8500_gpadc_read_raw(gpadc, MAIN_BAT_V); + main_bat_v_raw = ab8500_gpadc_read_raw(gpadc, MAIN_BAT_V, + avg_sample, trig_edge, trig_timer, conv_type); main_bat_v_convert = ab8500_gpadc_ad_to_voltage(gpadc, MAIN_BAT_V, - main_bat_v_raw); + main_bat_v_raw); return seq_printf(s, "%d,0x%X\n", main_bat_v_convert, main_bat_v_raw); @@ -1040,9 +1053,10 @@ static int ab8500_gpadc_vbus_v_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - vbus_v_raw = ab8500_gpadc_read_raw(gpadc, VBUS_V); + vbus_v_raw = ab8500_gpadc_read_raw(gpadc, VBUS_V, + avg_sample, trig_edge, trig_timer, conv_type); vbus_v_convert = ab8500_gpadc_ad_to_voltage(gpadc, VBUS_V, - vbus_v_raw); + vbus_v_raw); return seq_printf(s, "%d,0x%X\n", vbus_v_convert, vbus_v_raw); @@ -1068,9 +1082,10 @@ static int ab8500_gpadc_main_charger_c_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - main_charger_c_raw = ab8500_gpadc_read_raw(gpadc, MAIN_CHARGER_C); + main_charger_c_raw = ab8500_gpadc_read_raw(gpadc, MAIN_CHARGER_C, + avg_sample, trig_edge, trig_timer, conv_type); main_charger_c_convert = ab8500_gpadc_ad_to_voltage(gpadc, - MAIN_CHARGER_C, main_charger_c_raw); + MAIN_CHARGER_C, main_charger_c_raw); return seq_printf(s, "%d,0x%X\n", main_charger_c_convert, main_charger_c_raw); @@ -1098,9 +1113,10 @@ static int ab8500_gpadc_usb_charger_c_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - usb_charger_c_raw = ab8500_gpadc_read_raw(gpadc, USB_CHARGER_C); + usb_charger_c_raw = ab8500_gpadc_read_raw(gpadc, USB_CHARGER_C, + avg_sample, trig_edge, trig_timer, conv_type); usb_charger_c_convert = ab8500_gpadc_ad_to_voltage(gpadc, - USB_CHARGER_C, usb_charger_c_raw); + USB_CHARGER_C, usb_charger_c_raw); return seq_printf(s, "%d,0x%X\n", usb_charger_c_convert, usb_charger_c_raw); @@ -1128,9 +1144,10 @@ static int ab8500_gpadc_bk_bat_v_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - bk_bat_v_raw = ab8500_gpadc_read_raw(gpadc, BK_BAT_V); + bk_bat_v_raw = ab8500_gpadc_read_raw(gpadc, BK_BAT_V, + avg_sample, trig_edge, trig_timer, conv_type); bk_bat_v_convert = ab8500_gpadc_ad_to_voltage(gpadc, - BK_BAT_V, bk_bat_v_raw); + BK_BAT_V, bk_bat_v_raw); return seq_printf(s, "%d,0x%X\n", bk_bat_v_convert, bk_bat_v_raw); @@ -1156,9 +1173,10 @@ static int 
ab8500_gpadc_die_temp_print(struct seq_file *s, void *p) struct ab8500_gpadc *gpadc; gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); - die_temp_raw = ab8500_gpadc_read_raw(gpadc, DIE_TEMP); + die_temp_raw = ab8500_gpadc_read_raw(gpadc, DIE_TEMP, + avg_sample, trig_edge, trig_timer, conv_type); die_temp_convert = ab8500_gpadc_ad_to_voltage(gpadc, DIE_TEMP, - die_temp_raw); + die_temp_raw); return seq_printf(s, "%d,0x%X\n", die_temp_convert, die_temp_raw); @@ -1177,6 +1195,208 @@ static const struct file_operations ab8500_gpadc_die_temp_fops = { .owner = THIS_MODULE, }; +static int ab8500_gpadc_avg_sample_print(struct seq_file *s, void *p) +{ + return seq_printf(s, "%d\n", avg_sample); +} + +static int ab8500_gpadc_avg_sample_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab8500_gpadc_avg_sample_print, + inode->i_private); +} + +static ssize_t ab8500_gpadc_avg_sample_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct device *dev = ((struct seq_file *)(file->private_data))->private; + char buf[32]; + int buf_size; + unsigned long user_avg_sample; + int err; + + /* Get userspace string and assure termination */ + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + buf[buf_size] = 0; + + err = strict_strtoul(buf, 0, &user_avg_sample); + if (err) + return -EINVAL; + if ((user_avg_sample == SAMPLE_1) || (user_avg_sample == SAMPLE_4) + || (user_avg_sample == SAMPLE_8) + || (user_avg_sample == SAMPLE_16)) { + avg_sample = (u8) user_avg_sample; + } else { + dev_err(dev, "debugfs error input: " + "should be egal to 1, 4, 8 or 16\n"); + return -EINVAL; + } + return buf_size; +} + +static const struct file_operations ab8500_gpadc_avg_sample_fops = { + .open = ab8500_gpadc_avg_sample_open, + .read = seq_read, + .write = ab8500_gpadc_avg_sample_write, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8500_gpadc_trig_edge_print(struct seq_file *s, void *p) +{ + return seq_printf(s, "%d\n", trig_edge); +} + +static int ab8500_gpadc_trig_edge_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab8500_gpadc_trig_edge_print, + inode->i_private); +} + +static ssize_t ab8500_gpadc_trig_edge_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct device *dev = ((struct seq_file *)(file->private_data))->private; + char buf[32]; + int buf_size; + unsigned long user_trig_edge; + int err; + + /* Get userspace string and assure termination */ + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + buf[buf_size] = 0; + + err = strict_strtoul(buf, 0, &user_trig_edge); + if (err) + return -EINVAL; + if ((user_trig_edge == RISING_EDGE) + || (user_trig_edge == FALLING_EDGE)) { + trig_edge = (u8) user_trig_edge; + } else { + dev_err(dev, "Wrong input:\n" + "Enter 0. Rising edge\n" + "Enter 1. 
Falling edge\n"); + return -EINVAL; + } + return buf_size; +} + +static const struct file_operations ab8500_gpadc_trig_edge_fops = { + .open = ab8500_gpadc_trig_edge_open, + .read = seq_read, + .write = ab8500_gpadc_trig_edge_write, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8500_gpadc_trig_timer_print(struct seq_file *s, void *p) +{ + return seq_printf(s, "%d\n", trig_timer); +} + +static int ab8500_gpadc_trig_timer_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab8500_gpadc_trig_timer_print, + inode->i_private); +} + +static ssize_t ab8500_gpadc_trig_timer_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct device *dev = ((struct seq_file *)(file->private_data))->private; + char buf[32]; + int buf_size; + unsigned long user_trig_timer; + int err; + + /* Get userspace string and assure termination */ + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + buf[buf_size] = 0; + + err = strict_strtoul(buf, 0, &user_trig_timer); + if (err) + return -EINVAL; + if ((user_trig_timer >= 0) && (user_trig_timer <= 255)) { + trig_timer = (u8) user_trig_timer; + } else { + dev_err(dev, "debugfs error input: " + "should be beetween 0 to 255\n"); + return -EINVAL; + } + return buf_size; +} + +static const struct file_operations ab8500_gpadc_trig_timer_fops = { + .open = ab8500_gpadc_trig_timer_open, + .read = seq_read, + .write = ab8500_gpadc_trig_timer_write, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8500_gpadc_conv_type_print(struct seq_file *s, void *p) +{ + return seq_printf(s, "%d\n", conv_type); +} + +static int ab8500_gpadc_conv_type_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab8500_gpadc_conv_type_print, + inode->i_private); +} + +static ssize_t ab8500_gpadc_conv_type_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct device *dev = ((struct seq_file *)(file->private_data))->private; + char buf[32]; + int buf_size; + unsigned long user_conv_type; + int err; + + /* Get userspace string and assure termination */ + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + buf[buf_size] = 0; + + err = strict_strtoul(buf, 0, &user_conv_type); + if (err) + return -EINVAL; + if ((user_conv_type == ADC_SW) + || (user_conv_type == ADC_HW)) { + conv_type = (u8) user_conv_type; + } else { + dev_err(dev, "Wrong input:\n" + "Enter 0. ADC SW conversion\n" + "Enter 1. ADC HW conversion\n"); + return -EINVAL; + } + return buf_size; +} + +static const struct file_operations ab8500_gpadc_conv_type_fops = { + .open = ab8500_gpadc_conv_type_open, + .read = seq_read, + .write = ab8500_gpadc_conv_type_write, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + /* * return length of an ASCII numerical value, 0 is string is not a * numerical value. 
@@ -1722,6 +1942,26 @@ static int ab8500_debug_probe(struct platform_device *plf) if (!file) goto err; + file = debugfs_create_file("avg_sample", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, &ab8500_gpadc_avg_sample_fops); + if (!file) + goto err; + + file = debugfs_create_file("trig_edge", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, &ab8500_gpadc_trig_edge_fops); + if (!file) + goto err; + + file = debugfs_create_file("trig_timer", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, &ab8500_gpadc_trig_timer_fops); + if (!file) + goto err; + + file = debugfs_create_file("conv_type", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, &ab8500_gpadc_conv_type_fops); + if (!file) + goto err; + return 0; err: diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c index 7f39479c1afc..8673bf66f7d7 100644 --- a/drivers/mfd/ab8500-gpadc.c +++ b/drivers/mfd/ab8500-gpadc.c @@ -55,13 +55,18 @@ #define EN_VTVOUT 0x02 #define EN_GPADC 0x01 #define DIS_GPADC 0x00 -#define SW_AVG_16 0x60 +#define AVG_1 0x00 +#define AVG_4 0x20 +#define AVG_8 0x40 +#define AVG_16 0x60 #define ADC_SW_CONV 0x04 #define EN_ICHAR 0x80 #define BTEMP_PULL_UP 0x08 #define EN_BUF 0x40 #define DIS_ZERO 0x00 #define GPADC_BUSY 0x01 +#define EN_FALLING 0x10 +#define EN_TRIG_EDGE 0x02 /* GPADC constants from AB8500 spec, UM0836 */ #define ADC_RESOLUTION 1024 @@ -116,7 +121,10 @@ struct adc_cal_data { * the completion of gpadc conversion * @ab8500_gpadc_lock: structure of type mutex * @regu: pointer to the struct regulator - * @irq: interrupt number that is used by gpadc + * @irq_sw: interrupt number that is used by gpadc for Sw + * conversion + * @irq_hw: interrupt number that is used by gpadc for Hw + * conversion * @cal_data array of ADC calibration data structs */ struct ab8500_gpadc { @@ -126,7 +134,8 @@ struct ab8500_gpadc { struct completion ab8500_gpadc_complete; struct mutex ab8500_gpadc_lock; struct regulator *regu; - int irq; + int irq_sw; + int irq_hw; struct adc_cal_data cal_data[NBR_CAL_INPUTS]; }; @@ -244,30 +253,35 @@ int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, u8 channel, EXPORT_SYMBOL(ab8500_gpadc_ad_to_voltage); /** - * ab8500_gpadc_convert() - gpadc conversion + * ab8500_gpadc_sw_hw_convert() - gpadc conversion * @channel: analog channel to be converted to digital data + * @avg_sample: number of ADC sample to average + * @trig_egde: selected ADC trig edge + * @trig_timer: selected ADC trigger delay timer + * @conv_type: selected conversion type (HW or SW conversion) * * This function converts the selected analog i/p to digital * data. 
*/ -int ab8500_gpadc_convert(struct ab8500_gpadc *gpadc, u8 channel) +int ab8500_gpadc_sw_hw_convert(struct ab8500_gpadc *gpadc, u8 channel, + u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type) { int ad_value; int voltage; - ad_value = ab8500_gpadc_read_raw(gpadc, channel); - - /* On failure retry a second time */ + ad_value = ab8500_gpadc_read_raw(gpadc, channel, avg_sample, + trig_edge, trig_timer, conv_type); +/* On failure retry a second time */ if (ad_value < 0) - ad_value = ab8500_gpadc_read_raw(gpadc, channel); - - if (ad_value < 0) { - dev_err(gpadc->dev, "GPADC raw value failed ch: %d\n", channel); + ad_value = ab8500_gpadc_read_raw(gpadc, channel, avg_sample, + trig_edge, trig_timer, conv_type); +if (ad_value < 0) { + dev_err(gpadc->dev, "GPADC raw value failed ch: %d\n", + channel); return ad_value; } voltage = ab8500_gpadc_ad_to_voltage(gpadc, channel, ad_value); - if (voltage < 0) dev_err(gpadc->dev, "GPADC to voltage conversion failed ch:" " %d AD: 0x%x\n", channel, ad_value); @@ -279,11 +293,16 @@ EXPORT_SYMBOL(ab8500_gpadc_convert); /** * ab8500_gpadc_read_raw() - gpadc read * @channel: analog channel to be read + * @avg_sample: number of ADC sample to average + * @trig_edge: selected trig edge + * @trig_timer: selected ADC trigger delay timer + * @conv_type: selected conversion type (HW or SW conversion) * - * This function obtains the raw ADC value, this then needs - * to be converted by calling ab8500_gpadc_ad_to_voltage() + * This function obtains the raw ADC value for an hardware conversion, + * this then needs to be converted by calling ab8500_gpadc_ad_to_voltage() */ -int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) +int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, + u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type) { int ret; int looplimit = 0; @@ -293,7 +312,6 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) return -ENODEV; mutex_lock(&gpadc->ab8500_gpadc_lock); - /* Enable VTVout LDO this is required for GPADC */ pm_runtime_get_sync(gpadc->dev); @@ -321,9 +339,29 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) goto out; } - /* Select the channel source and set average samples to 16 */ - ret = abx500_set_register_interruptible(gpadc->dev, AB8500_GPADC, - AB8500_GPADC_CTRL2_REG, (channel | SW_AVG_16)); + /* Select the channel source and set average samples */ + switch (avg_sample) { + case SAMPLE_1: + val = channel | AVG_1; + break; + case SAMPLE_4: + val = channel | AVG_4; + break; + case SAMPLE_8: + val = channel | AVG_8; + break; + default: + val = channel | AVG_16; + break; + + } + + if (conv_type == ADC_HW) + ret = abx500_set_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL3_REG, val); + else + ret = abx500_set_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL2_REG, val); if (ret < 0) { dev_err(gpadc->dev, "gpadc_conversion: set avg samples failed\n"); @@ -335,22 +373,43 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) * charging current sense if it needed, ABB 3.0 needs some special * treatment too. 
*/ + if ((conv_type == ADC_HW) && (trig_edge)) { + ret = abx500_mask_and_set_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_FALLING, EN_FALLING); + + } switch (channel) { case MAIN_CHARGER_C: case USB_CHARGER_C: - ret = abx500_mask_and_set_register_interruptible(gpadc->dev, - AB8500_GPADC, AB8500_GPADC_CTRL1_REG, - EN_BUF | EN_ICHAR, - EN_BUF | EN_ICHAR); - break; - case BTEMP_BALL: - if (!is_ab8500_2p0_or_earlier(gpadc->parent)) { - /* Turn on btemp pull-up on ABB 3.0 */ + if (conv_type == ADC_HW) ret = abx500_mask_and_set_register_interruptible( gpadc->dev, AB8500_GPADC, AB8500_GPADC_CTRL1_REG, - EN_BUF | BTEMP_PULL_UP, - EN_BUF | BTEMP_PULL_UP); + EN_BUF | EN_ICHAR | EN_TRIG_EDGE, + EN_BUF | EN_ICHAR | EN_TRIG_EDGE); + else + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF | EN_ICHAR, + EN_BUF | EN_ICHAR); + break; + case BTEMP_BALL: + if (!is_ab8500_2p0_or_earlier(gpadc->parent)) { + if (conv_type == ADC_HW) + /* Turn on btemp pull-up on ABB 3.0 */ + ret = abx500_mask_and_set_register_interruptible + (gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF | BTEMP_PULL_UP | EN_TRIG_EDGE, + EN_BUF | BTEMP_PULL_UP | EN_TRIG_EDGE); + else + ret = abx500_mask_and_set_register_interruptible + (gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF | BTEMP_PULL_UP, + EN_BUF | BTEMP_PULL_UP); /* * Delay might be needed for ABB8500 cut 3.0, if not, remove @@ -361,8 +420,17 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) } /* Intentional fallthrough */ default: - ret = abx500_mask_and_set_register_interruptible(gpadc->dev, - AB8500_GPADC, AB8500_GPADC_CTRL1_REG, EN_BUF, EN_BUF); + if (conv_type == ADC_HW) + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF | EN_TRIG_EDGE, + EN_BUF | EN_TRIG_EDGE); + else + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, + AB8500_GPADC_CTRL1_REG, EN_BUF, EN_BUF); break; } if (ret < 0) { @@ -371,36 +439,83 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) goto out; } - ret = abx500_mask_and_set_register_interruptible(gpadc->dev, - AB8500_GPADC, AB8500_GPADC_CTRL1_REG, ADC_SW_CONV, ADC_SW_CONV); - if (ret < 0) { - dev_err(gpadc->dev, - "gpadc_conversion: start s/w conversion failed\n"); - goto out; + /* Set trigger delay timer */ + if (conv_type == ADC_HW) { + ret = abx500_set_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_AUTO_TIMER_REG, trig_timer); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: trig timer failed\n"); + goto out; + } + } + + /* Start SW conversion */ + if (conv_type == ADC_SW) { + ret = abx500_mask_and_set_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + ADC_SW_CONV, ADC_SW_CONV); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: start s/w conv failed\n"); + goto out; + } } + /* wait for completion of conversion */ - if (!wait_for_completion_timeout(&gpadc->ab8500_gpadc_complete, - msecs_to_jiffies(CONVERSION_TIME))) { - dev_err(gpadc->dev, - "timeout: didn't receive GPADC conversion interrupt\n"); - ret = -EINVAL; - goto out; + if (conv_type == ADC_HW) { + if (!wait_for_completion_timeout(&gpadc->ab8500_gpadc_complete, + 2*HZ)) { + dev_err(gpadc->dev, + "timeout didn't receive" + " hw GPADC conv interrupt\n"); + ret = -EINVAL; + goto out; + } + } else { + if (!wait_for_completion_timeout(&gpadc->ab8500_gpadc_complete, + 
msecs_to_jiffies(CONVERSION_TIME))) { + dev_err(gpadc->dev, + "timeout didn't receive" + " sw GPADC conv interrupt\n"); + ret = -EINVAL; + goto out; + } } /* Read the converted RAW data */ - ret = abx500_get_register_interruptible(gpadc->dev, AB8500_GPADC, - AB8500_GPADC_MANDATAL_REG, &low_data); - if (ret < 0) { - dev_err(gpadc->dev, "gpadc_conversion: read low data failed\n"); - goto out; - } + if (conv_type == ADC_HW) { + ret = abx500_get_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_AUTODATAL_REG, &low_data); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: read hw low data failed\n"); + goto out; + } - ret = abx500_get_register_interruptible(gpadc->dev, AB8500_GPADC, - AB8500_GPADC_MANDATAH_REG, &high_data); - if (ret < 0) { - dev_err(gpadc->dev, - "gpadc_conversion: read high data failed\n"); - goto out; + ret = abx500_get_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_AUTODATAH_REG, &high_data); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: read hw high data failed\n"); + goto out; + } + } else { + ret = abx500_get_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_MANDATAL_REG, &low_data); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: read sw low data failed\n"); + goto out; + } + + ret = abx500_get_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8500_GPADC_MANDATAH_REG, &high_data); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: read sw high data failed\n"); + goto out; + } } /* Disable GPADC */ @@ -411,6 +526,7 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel) goto out; } + /* Disable VTVout LDO this is required for GPADC */ pm_runtime_mark_last_busy(gpadc->dev); pm_runtime_put_autosuspend(gpadc->dev); @@ -427,9 +543,7 @@ out: */ (void) abx500_set_register_interruptible(gpadc->dev, AB8500_GPADC, AB8500_GPADC_CTRL1_REG, DIS_GPADC); - pm_runtime_put(gpadc->dev); - mutex_unlock(&gpadc->ab8500_gpadc_lock); dev_err(gpadc->dev, "gpadc_conversion: Failed to AD convert channel %d\n", channel); @@ -438,16 +552,16 @@ out: EXPORT_SYMBOL(ab8500_gpadc_read_raw); /** - * ab8500_bm_gpswadcconvend_handler() - isr for s/w gpadc conversion completion + * ab8500_bm_gpadcconvend_handler() - isr for gpadc conversion completion * @irq: irq number * @data: pointer to the data passed during request irq * - * This is a interrupt service routine for s/w gpadc conversion completion. + * This is a interrupt service routine for gpadc conversion completion. * Notifies the gpadc completion is completed and the converted raw value * can be read from the registers. 
* Returns IRQ status(IRQ_HANDLED) */ -static irqreturn_t ab8500_bm_gpswadcconvend_handler(int irq, void *_gpadc) +static irqreturn_t ab8500_bm_gpadcconvend_handler(int irq, void *_gpadc) { struct ab8500_gpadc *gpadc = _gpadc; @@ -646,11 +760,19 @@ static int ab8500_gpadc_probe(struct platform_device *pdev) return -ENOMEM; } - gpadc->irq = platform_get_irq_byname(pdev, "SW_CONV_END"); - if (gpadc->irq < 0) { - dev_err(&pdev->dev, "failed to get platform irq-%d\n", - gpadc->irq); - ret = gpadc->irq; + gpadc->irq_sw = platform_get_irq_byname(pdev, "SW_CONV_END"); + if (gpadc->irq_sw < 0) { + dev_err(gpadc->dev, "failed to get platform irq-%d\n", + gpadc->irq_sw); + ret = gpadc->irq_sw; + goto fail; + } + + gpadc->irq_hw = platform_get_irq_byname(pdev, "HW_CONV_END"); + if (gpadc->irq_hw < 0) { + dev_err(gpadc->dev, "failed to get platform irq-%d\n", + gpadc->irq_hw); + ret = gpadc->irq_hw; goto fail; } @@ -661,14 +783,21 @@ static int ab8500_gpadc_probe(struct platform_device *pdev) /* Initialize completion used to notify completion of conversion */ init_completion(&gpadc->ab8500_gpadc_complete); - /* Register interrupt - SwAdcComplete */ - ret = request_threaded_irq(gpadc->irq, NULL, - ab8500_bm_gpswadcconvend_handler, - IRQF_ONESHOT | IRQF_NO_SUSPEND | IRQF_SHARED, - "ab8500-gpadc", gpadc); + /* Register interrupts */ + ret = request_threaded_irq(gpadc->irq_sw, NULL, + ab8500_bm_gpadcconvend_handler, + IRQF_NO_SUSPEND | IRQF_SHARED, "ab8500-gpadc-sw", gpadc); + if (ret < 0) { + dev_err(gpadc->dev, "Failed to register interrupt, irq: %d\n", + gpadc->irq_sw); + goto fail; + } + ret = request_threaded_irq(gpadc->irq_hw, NULL, + ab8500_bm_gpadcconvend_handler, + IRQF_NO_SUSPEND | IRQF_SHARED, "ab8500-gpadc-hw", gpadc); if (ret < 0) { dev_err(gpadc->dev, "Failed to register interrupt, irq: %d\n", - gpadc->irq); + gpadc->irq_hw); goto fail; } @@ -694,7 +823,8 @@ static int ab8500_gpadc_probe(struct platform_device *pdev) dev_dbg(gpadc->dev, "probe success\n"); return 0; fail_irq: - free_irq(gpadc->irq, gpadc); + free_irq(gpadc->irq_sw, gpadc); + free_irq(gpadc->irq_hw, gpadc); fail: kfree(gpadc); gpadc = NULL; @@ -708,7 +838,8 @@ static int ab8500_gpadc_remove(struct platform_device *pdev) /* remove this gpadc entry from the list */ list_del(&gpadc->node); /* remove interrupt - completion of Sw ADC conversion */ - free_irq(gpadc->irq, gpadc); + free_irq(gpadc->irq_sw, gpadc); + free_irq(gpadc->irq_hw, gpadc); pm_runtime_get_sync(gpadc->dev); pm_runtime_disable(gpadc->dev); @@ -757,6 +888,7 @@ subsys_initcall_sync(ab8500_gpadc_init); module_exit(ab8500_gpadc_exit); MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Arun R Murthy, Daniel Willerud, Johan Palsson"); +MODULE_AUTHOR("Arun R Murthy, Daniel Willerud, Johan Palsson," + "M'boumba Cedric Madianga"); MODULE_ALIAS("platform:ab8500_gpadc"); MODULE_DESCRIPTION("AB8500 GPADC driver"); diff --git a/include/linux/mfd/abx500/ab8500-gpadc.h b/include/linux/mfd/abx500/ab8500-gpadc.h index 252966769d93..7694e7ab1880 100644 --- a/include/linux/mfd/abx500/ab8500-gpadc.h +++ b/include/linux/mfd/abx500/ab8500-gpadc.h @@ -4,12 +4,14 @@ * * Author: Arun R Murthy * Author: Daniel Willerud + * Author: M'boumba Cedric Madianga */ #ifndef _AB8500_GPADC_H #define _AB8500_GPADC_H -/* GPADC source: From datasheet(ADCSwSel[4:0] in GPADCCtrl2) */ +/* GPADC source: From datasheet(ADCSwSel[4:0] in GPADCCtrl2 + * and ADCHwSel[4:0] in GPADCCtrl3 ) */ #define BAT_CTRL 0x01 #define BTEMP_BALL 0x02 #define MAIN_CHARGER_V 0x03 @@ -24,12 +26,32 @@ #define BK_BAT_V 0x0C #define 
DIE_TEMP 0x0D +#define SAMPLE_1 1 +#define SAMPLE_4 4 +#define SAMPLE_8 8 +#define SAMPLE_16 16 +#define RISING_EDGE 0 +#define FALLING_EDGE 1 + +/* Arbitrary ADC conversion type constants */ +#define ADC_SW 0 +#define ADC_HW 1 + + struct ab8500_gpadc; struct ab8500_gpadc *ab8500_gpadc_get(char *name); -int ab8500_gpadc_convert(struct ab8500_gpadc *gpadc, u8 channel); -int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel); +int ab8500_gpadc_sw_hw_convert(struct ab8500_gpadc *gpadc, u8 channel, + u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type); +static inline int ab8500_gpadc_convert(struct ab8500_gpadc *gpadc, u8 channel) +{ + return ab8500_gpadc_sw_hw_convert(gpadc, channel, + SAMPLE_16, 0, 0, ADC_SW); +} + +int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, + u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type); int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, - u8 channel, int ad_value); + u8 channel, int ad_value); #endif /* _AB8500_GPADC_H */ -- cgit From 022ab148d28e8466e45d28552224e3029f1cccd8 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sat, 16 Feb 2013 10:25:07 +0100 Subject: pinctrl: Declare operation structures as const The pinconf, pinctrl and pinmux operation structures hold function pointers that are never modified. Declare them as const. Signed-off-by: Laurent Pinchart Signed-off-by: Linus Walleij --- drivers/pinctrl/devicetree.c | 4 ++-- drivers/pinctrl/mvebu/pinctrl-mvebu.c | 6 +++--- drivers/pinctrl/pinconf.c | 2 +- drivers/pinctrl/pinctrl-abx500.c | 6 +++--- drivers/pinctrl/pinctrl-at91.c | 6 +++--- drivers/pinctrl/pinctrl-bcm2835.c | 6 +++--- drivers/pinctrl/pinctrl-exynos5440.c | 6 +++--- drivers/pinctrl/pinctrl-falcon.c | 2 +- drivers/pinctrl/pinctrl-imx.c | 6 +++--- drivers/pinctrl/pinctrl-lantiq.c | 4 ++-- drivers/pinctrl/pinctrl-mxs.c | 6 +++--- drivers/pinctrl/pinctrl-nomadik.c | 6 +++--- drivers/pinctrl/pinctrl-pxa3xx.c | 4 ++-- drivers/pinctrl/pinctrl-samsung.c | 6 +++--- drivers/pinctrl/pinctrl-single.c | 6 +++--- drivers/pinctrl/pinctrl-sirf.c | 4 ++-- drivers/pinctrl/pinctrl-sunxi.c | 6 +++--- drivers/pinctrl/pinctrl-tegra.c | 6 +++--- drivers/pinctrl/pinctrl-u300.c | 6 +++--- drivers/pinctrl/pinctrl-xway.c | 2 +- drivers/pinctrl/spear/pinctrl-spear.c | 4 ++-- include/linux/pinctrl/pinctrl.h | 6 +++--- 22 files changed, 55 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/pinctrl/devicetree.c b/drivers/pinctrl/devicetree.c index fd40a11ad645..c7b7cb477129 100644 --- a/drivers/pinctrl/devicetree.c +++ b/drivers/pinctrl/devicetree.c @@ -41,7 +41,7 @@ static void dt_free_map(struct pinctrl_dev *pctldev, struct pinctrl_map *map, unsigned num_maps) { if (pctldev) { - struct pinctrl_ops *ops = pctldev->desc->pctlops; + const struct pinctrl_ops *ops = pctldev->desc->pctlops; ops->dt_free_map(pctldev, map, num_maps); } else { /* There is no pctldev for PIN_MAP_TYPE_DUMMY_STATE */ @@ -122,7 +122,7 @@ static int dt_to_map_one_config(struct pinctrl *p, const char *statename, { struct device_node *np_pctldev; struct pinctrl_dev *pctldev; - struct pinctrl_ops *ops; + const struct pinctrl_ops *ops; int ret; struct pinctrl_map *map; unsigned num_maps; diff --git a/drivers/pinctrl/mvebu/pinctrl-mvebu.c b/drivers/pinctrl/mvebu/pinctrl-mvebu.c index c689c04a4f52..61149914882d 100644 --- a/drivers/pinctrl/mvebu/pinctrl-mvebu.c +++ b/drivers/pinctrl/mvebu/pinctrl-mvebu.c @@ -263,7 +263,7 @@ static void mvebu_pinconf_group_dbg_show(struct pinctrl_dev *pctldev, return; } -static struct 
pinconf_ops mvebu_pinconf_ops = { +static const struct pinconf_ops mvebu_pinconf_ops = { .pin_config_group_get = mvebu_pinconf_group_get, .pin_config_group_set = mvebu_pinconf_group_set, .pin_config_group_dbg_show = mvebu_pinconf_group_dbg_show, @@ -369,7 +369,7 @@ static int mvebu_pinmux_gpio_set_direction(struct pinctrl_dev *pctldev, return -ENOTSUPP; } -static struct pinmux_ops mvebu_pinmux_ops = { +static const struct pinmux_ops mvebu_pinmux_ops = { .get_functions_count = mvebu_pinmux_get_funcs_count, .get_function_name = mvebu_pinmux_get_func_name, .get_function_groups = mvebu_pinmux_get_groups, @@ -470,7 +470,7 @@ static void mvebu_pinctrl_dt_free_map(struct pinctrl_dev *pctldev, kfree(map); } -static struct pinctrl_ops mvebu_pinctrl_ops = { +static const struct pinctrl_ops mvebu_pinctrl_ops = { .get_groups_count = mvebu_pinctrl_get_groups_count, .get_group_name = mvebu_pinctrl_get_group_name, .get_group_pins = mvebu_pinctrl_get_group_pins, diff --git a/drivers/pinctrl/pinconf.c b/drivers/pinctrl/pinconf.c index ac8d382a79bb..8aefd28c797e 100644 --- a/drivers/pinctrl/pinconf.c +++ b/drivers/pinctrl/pinconf.c @@ -670,7 +670,7 @@ static int pinconf_dbg_config_print(struct seq_file *s, void *d) struct pinctrl_maps *maps_node; struct pinctrl_map const *map; struct pinctrl_dev *pctldev = NULL; - struct pinconf_ops *confops = NULL; + const struct pinconf_ops *confops = NULL; int i, j; bool found = false; diff --git a/drivers/pinctrl/pinctrl-abx500.c b/drivers/pinctrl/pinctrl-abx500.c index caecdd373061..169d72c59a7b 100644 --- a/drivers/pinctrl/pinctrl-abx500.c +++ b/drivers/pinctrl/pinctrl-abx500.c @@ -656,7 +656,7 @@ static void abx500_gpio_disable_free(struct pinctrl_dev *pctldev, { } -static struct pinmux_ops abx500_pinmux_ops = { +static const struct pinmux_ops abx500_pinmux_ops = { .get_functions_count = abx500_pmx_get_funcs_cnt, .get_function_name = abx500_pmx_get_func_name, .get_function_groups = abx500_pmx_get_func_groups, @@ -704,7 +704,7 @@ static void abx500_pin_dbg_show(struct pinctrl_dev *pctldev, chip->base + offset - 1); } -static struct pinctrl_ops abx500_pinctrl_ops = { +static const struct pinctrl_ops abx500_pinctrl_ops = { .get_groups_count = abx500_get_groups_cnt, .get_group_name = abx500_get_group_name, .get_group_pins = abx500_get_group_pins, @@ -778,7 +778,7 @@ int abx500_pin_config_set(struct pinctrl_dev *pctldev, return ret; } -static struct pinconf_ops abx500_pinconf_ops = { +static const struct pinconf_ops abx500_pinconf_ops = { .pin_config_get = abx500_pin_config_get, .pin_config_set = abx500_pin_config_set, }; diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 75933a6aa828..e50fa5f863e1 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -294,7 +294,7 @@ static void at91_dt_free_map(struct pinctrl_dev *pctldev, { } -static struct pinctrl_ops at91_pctrl_ops = { +static const struct pinctrl_ops at91_pctrl_ops = { .get_groups_count = at91_get_groups_count, .get_group_name = at91_get_group_name, .get_group_pins = at91_get_group_pins, @@ -696,7 +696,7 @@ static void at91_gpio_disable_free(struct pinctrl_dev *pctldev, /* Set the pin to some default state, GPIO is usually default */ } -static struct pinmux_ops at91_pmx_ops = { +static const struct pinmux_ops at91_pmx_ops = { .get_functions_count = at91_pmx_get_funcs_count, .get_function_name = at91_pmx_get_func_name, .get_function_groups = at91_pmx_get_groups, @@ -776,7 +776,7 @@ static void at91_pinconf_group_dbg_show(struct pinctrl_dev *pctldev, { } 
-static struct pinconf_ops at91_pinconf_ops = { +static const struct pinconf_ops at91_pinconf_ops = { .pin_config_get = at91_pinconf_get, .pin_config_set = at91_pinconf_set, .pin_config_dbg_show = at91_pinconf_dbg_show, diff --git a/drivers/pinctrl/pinctrl-bcm2835.c b/drivers/pinctrl/pinctrl-bcm2835.c index 4eb6d2c4e4df..f28d4b08771a 100644 --- a/drivers/pinctrl/pinctrl-bcm2835.c +++ b/drivers/pinctrl/pinctrl-bcm2835.c @@ -795,7 +795,7 @@ out: return err; } -static struct pinctrl_ops bcm2835_pctl_ops = { +static const struct pinctrl_ops bcm2835_pctl_ops = { .get_groups_count = bcm2835_pctl_get_groups_count, .get_group_name = bcm2835_pctl_get_group_name, .get_group_pins = bcm2835_pctl_get_group_pins, @@ -872,7 +872,7 @@ static int bcm2835_pmx_gpio_set_direction(struct pinctrl_dev *pctldev, return 0; } -static struct pinmux_ops bcm2835_pmx_ops = { +static const struct pinmux_ops bcm2835_pmx_ops = { .get_functions_count = bcm2835_pmx_get_functions_count, .get_function_name = bcm2835_pmx_get_function_name, .get_function_groups = bcm2835_pmx_get_function_groups, @@ -916,7 +916,7 @@ static int bcm2835_pinconf_set(struct pinctrl_dev *pctldev, return 0; } -static struct pinconf_ops bcm2835_pinconf_ops = { +static const struct pinconf_ops bcm2835_pinconf_ops = { .pin_config_get = bcm2835_pinconf_get, .pin_config_set = bcm2835_pinconf_set, }; diff --git a/drivers/pinctrl/pinctrl-exynos5440.c b/drivers/pinctrl/pinctrl-exynos5440.c index 1376eb7305db..169ea3e5f777 100644 --- a/drivers/pinctrl/pinctrl-exynos5440.c +++ b/drivers/pinctrl/pinctrl-exynos5440.c @@ -286,7 +286,7 @@ static void exynos5440_dt_free_map(struct pinctrl_dev *pctldev, } /* list of pinctrl callbacks for the pinctrl core */ -static struct pinctrl_ops exynos5440_pctrl_ops = { +static const struct pinctrl_ops exynos5440_pctrl_ops = { .get_groups_count = exynos5440_get_group_count, .get_group_name = exynos5440_get_group_name, .get_group_pins = exynos5440_get_group_pins, @@ -374,7 +374,7 @@ static int exynos5440_pinmux_gpio_set_direction(struct pinctrl_dev *pctldev, } /* list of pinmux callbacks for the pinmux vertical in pinctrl core */ -static struct pinmux_ops exynos5440_pinmux_ops = { +static const struct pinmux_ops exynos5440_pinmux_ops = { .get_functions_count = exynos5440_get_functions_count, .get_function_name = exynos5440_pinmux_get_fname, .get_function_groups = exynos5440_pinmux_get_groups, @@ -523,7 +523,7 @@ static int exynos5440_pinconf_group_get(struct pinctrl_dev *pctldev, } /* list of pinconfig callbacks for pinconfig vertical in the pinctrl code */ -static struct pinconf_ops exynos5440_pinconf_ops = { +static const struct pinconf_ops exynos5440_pinconf_ops = { .pin_config_get = exynos5440_pinconf_get, .pin_config_set = exynos5440_pinconf_set, .pin_config_group_get = exynos5440_pinconf_group_get, diff --git a/drivers/pinctrl/pinctrl-falcon.c b/drivers/pinctrl/pinctrl-falcon.c index af97a1f90007..f9b2a1d4854f 100644 --- a/drivers/pinctrl/pinctrl-falcon.c +++ b/drivers/pinctrl/pinctrl-falcon.c @@ -353,7 +353,7 @@ static void falcon_pinconf_group_dbg_show(struct pinctrl_dev *pctrldev, { } -static struct pinconf_ops falcon_pinconf_ops = { +static const struct pinconf_ops falcon_pinconf_ops = { .pin_config_get = falcon_pinconf_get, .pin_config_set = falcon_pinconf_set, .pin_config_group_get = falcon_pinconf_group_get, diff --git a/drivers/pinctrl/pinctrl-imx.c b/drivers/pinctrl/pinctrl-imx.c index 4cebb9c6c5c5..0ef190449eab 100644 --- a/drivers/pinctrl/pinctrl-imx.c +++ b/drivers/pinctrl/pinctrl-imx.c @@ -207,7 +207,7 @@ 
static void imx_dt_free_map(struct pinctrl_dev *pctldev, kfree(map); } -static struct pinctrl_ops imx_pctrl_ops = { +static const struct pinctrl_ops imx_pctrl_ops = { .get_groups_count = imx_get_groups_count, .get_group_name = imx_get_group_name, .get_group_pins = imx_get_group_pins, @@ -299,7 +299,7 @@ static int imx_pmx_get_groups(struct pinctrl_dev *pctldev, unsigned selector, return 0; } -static struct pinmux_ops imx_pmx_ops = { +static const struct pinmux_ops imx_pmx_ops = { .get_functions_count = imx_pmx_get_funcs_count, .get_function_name = imx_pmx_get_func_name, .get_function_groups = imx_pmx_get_groups, @@ -397,7 +397,7 @@ static void imx_pinconf_group_dbg_show(struct pinctrl_dev *pctldev, } } -static struct pinconf_ops imx_pinconf_ops = { +static const struct pinconf_ops imx_pinconf_ops = { .pin_config_get = imx_pinconf_get, .pin_config_set = imx_pinconf_set, .pin_config_dbg_show = imx_pinconf_dbg_show, diff --git a/drivers/pinctrl/pinctrl-lantiq.c b/drivers/pinctrl/pinctrl-lantiq.c index a70384611351..615c5002b757 100644 --- a/drivers/pinctrl/pinctrl-lantiq.c +++ b/drivers/pinctrl/pinctrl-lantiq.c @@ -169,7 +169,7 @@ static int ltq_pinctrl_dt_node_to_map(struct pinctrl_dev *pctldev, return 0; } -static struct pinctrl_ops ltq_pctrl_ops = { +static const struct pinctrl_ops ltq_pctrl_ops = { .get_groups_count = ltq_get_group_count, .get_group_name = ltq_get_group_name, .get_group_pins = ltq_get_group_pins, @@ -311,7 +311,7 @@ static int ltq_pmx_gpio_request_enable(struct pinctrl_dev *pctrldev, return info->apply_mux(pctrldev, mfp, pin_func); } -static struct pinmux_ops ltq_pmx_ops = { +static const struct pinmux_ops ltq_pmx_ops = { .get_functions_count = ltq_pmx_func_count, .get_function_name = ltq_pmx_func_name, .get_function_groups = ltq_pmx_get_groups, diff --git a/drivers/pinctrl/pinctrl-mxs.c b/drivers/pinctrl/pinctrl-mxs.c index 23af9f1f9c35..b45c4eb35798 100644 --- a/drivers/pinctrl/pinctrl-mxs.c +++ b/drivers/pinctrl/pinctrl-mxs.c @@ -158,7 +158,7 @@ static void mxs_dt_free_map(struct pinctrl_dev *pctldev, kfree(map); } -static struct pinctrl_ops mxs_pinctrl_ops = { +static const struct pinctrl_ops mxs_pinctrl_ops = { .get_groups_count = mxs_get_groups_count, .get_group_name = mxs_get_group_name, .get_group_pins = mxs_get_group_pins, @@ -219,7 +219,7 @@ static int mxs_pinctrl_enable(struct pinctrl_dev *pctldev, unsigned selector, return 0; } -static struct pinmux_ops mxs_pinmux_ops = { +static const struct pinmux_ops mxs_pinmux_ops = { .get_functions_count = mxs_pinctrl_get_funcs_count, .get_function_name = mxs_pinctrl_get_func_name, .get_function_groups = mxs_pinctrl_get_func_groups, @@ -319,7 +319,7 @@ static void mxs_pinconf_group_dbg_show(struct pinctrl_dev *pctldev, seq_printf(s, "0x%lx", config); } -static struct pinconf_ops mxs_pinconf_ops = { +static const struct pinconf_ops mxs_pinconf_ops = { .pin_config_get = mxs_pinconf_get, .pin_config_set = mxs_pinconf_set, .pin_config_group_get = mxs_pinconf_group_get, diff --git a/drivers/pinctrl/pinctrl-nomadik.c b/drivers/pinctrl/pinctrl-nomadik.c index 36d20293de5c..2328baaa86bf 100644 --- a/drivers/pinctrl/pinctrl-nomadik.c +++ b/drivers/pinctrl/pinctrl-nomadik.c @@ -1764,7 +1764,7 @@ int nmk_pinctrl_dt_node_to_map(struct pinctrl_dev *pctldev, return 0; } -static struct pinctrl_ops nmk_pinctrl_ops = { +static const struct pinctrl_ops nmk_pinctrl_ops = { .get_groups_count = nmk_get_groups_cnt, .get_group_name = nmk_get_group_name, .get_group_pins = nmk_get_group_pins, @@ -1975,7 +1975,7 @@ static void 
nmk_gpio_disable_free(struct pinctrl_dev *pctldev, /* Set the pin to some default state, GPIO is usually default */ } -static struct pinmux_ops nmk_pinmux_ops = { +static const struct pinmux_ops nmk_pinmux_ops = { .get_functions_count = nmk_pmx_get_funcs_cnt, .get_function_name = nmk_pmx_get_func_name, .get_function_groups = nmk_pmx_get_func_groups, @@ -2089,7 +2089,7 @@ static int nmk_pin_config_set(struct pinctrl_dev *pctldev, unsigned pin, return 0; } -static struct pinconf_ops nmk_pinconf_ops = { +static const struct pinconf_ops nmk_pinconf_ops = { .pin_config_get = nmk_pin_config_get, .pin_config_set = nmk_pin_config_set, }; diff --git a/drivers/pinctrl/pinctrl-pxa3xx.c b/drivers/pinctrl/pinctrl-pxa3xx.c index 1f49bb02a6af..05e11de1d144 100644 --- a/drivers/pinctrl/pinctrl-pxa3xx.c +++ b/drivers/pinctrl/pinctrl-pxa3xx.c @@ -53,7 +53,7 @@ static int pxa3xx_get_group_pins(struct pinctrl_dev *pctrldev, return 0; } -static struct pinctrl_ops pxa3xx_pctrl_ops = { +static const struct pinctrl_ops pxa3xx_pctrl_ops = { .get_groups_count = pxa3xx_get_groups_count, .get_group_name = pxa3xx_get_group_name, .get_group_pins = pxa3xx_get_group_pins, @@ -161,7 +161,7 @@ static int pxa3xx_pmx_request_gpio(struct pinctrl_dev *pctrldev, return 0; } -static struct pinmux_ops pxa3xx_pmx_ops = { +static const struct pinmux_ops pxa3xx_pmx_ops = { .get_functions_count = pxa3xx_pmx_get_funcs_count, .get_function_name = pxa3xx_pmx_get_func_name, .get_function_groups = pxa3xx_pmx_get_groups, diff --git a/drivers/pinctrl/pinctrl-samsung.c b/drivers/pinctrl/pinctrl-samsung.c index f206df175656..3475b92b24a4 100644 --- a/drivers/pinctrl/pinctrl-samsung.c +++ b/drivers/pinctrl/pinctrl-samsung.c @@ -214,7 +214,7 @@ static void samsung_dt_free_map(struct pinctrl_dev *pctldev, } /* list of pinctrl callbacks for the pinctrl core */ -static struct pinctrl_ops samsung_pctrl_ops = { +static const struct pinctrl_ops samsung_pctrl_ops = { .get_groups_count = samsung_get_group_count, .get_group_name = samsung_get_group_name, .get_group_pins = samsung_get_group_pins, @@ -357,7 +357,7 @@ static int samsung_pinmux_gpio_set_direction(struct pinctrl_dev *pctldev, } /* list of pinmux callbacks for the pinmux vertical in pinctrl core */ -static struct pinmux_ops samsung_pinmux_ops = { +static const struct pinmux_ops samsung_pinmux_ops = { .get_functions_count = samsung_get_functions_count, .get_function_name = samsung_pinmux_get_fname, .get_function_groups = samsung_pinmux_get_groups, @@ -468,7 +468,7 @@ static int samsung_pinconf_group_get(struct pinctrl_dev *pctldev, } /* list of pinconfig callbacks for pinconfig vertical in the pinctrl code */ -static struct pinconf_ops samsung_pinconf_ops = { +static const struct pinconf_ops samsung_pinconf_ops = { .pin_config_get = samsung_pinconf_get, .pin_config_set = samsung_pinconf_set, .pin_config_group_get = samsung_pinconf_group_get, diff --git a/drivers/pinctrl/pinctrl-single.c b/drivers/pinctrl/pinctrl-single.c index 5c32e880bcb2..0c0e2da9d880 100644 --- a/drivers/pinctrl/pinctrl-single.c +++ b/drivers/pinctrl/pinctrl-single.c @@ -270,7 +270,7 @@ static int pcs_dt_node_to_map(struct pinctrl_dev *pctldev, struct device_node *np_config, struct pinctrl_map **map, unsigned *num_maps); -static struct pinctrl_ops pcs_pinctrl_ops = { +static const struct pinctrl_ops pcs_pinctrl_ops = { .get_groups_count = pcs_get_groups_count, .get_group_name = pcs_get_group_name, .get_group_pins = pcs_get_group_pins, @@ -408,7 +408,7 @@ static int pcs_request_gpio(struct pinctrl_dev *pctldev, return 
-ENOTSUPP; } -static struct pinmux_ops pcs_pinmux_ops = { +static const struct pinmux_ops pcs_pinmux_ops = { .get_functions_count = pcs_get_functions_count, .get_function_name = pcs_get_function_name, .get_function_groups = pcs_get_function_groups, @@ -451,7 +451,7 @@ static void pcs_pinconf_group_dbg_show(struct pinctrl_dev *pctldev, { } -static struct pinconf_ops pcs_pinconf_ops = { +static const struct pinconf_ops pcs_pinconf_ops = { .pin_config_get = pcs_pinconf_get, .pin_config_set = pcs_pinconf_set, .pin_config_group_get = pcs_pinconf_group_get, diff --git a/drivers/pinctrl/pinctrl-sirf.c b/drivers/pinctrl/pinctrl-sirf.c index d02498b30c6e..0990a721758e 100644 --- a/drivers/pinctrl/pinctrl-sirf.c +++ b/drivers/pinctrl/pinctrl-sirf.c @@ -979,7 +979,7 @@ static void sirfsoc_dt_free_map(struct pinctrl_dev *pctldev, kfree(map); } -static struct pinctrl_ops sirfsoc_pctrl_ops = { +static const struct pinctrl_ops sirfsoc_pctrl_ops = { .get_groups_count = sirfsoc_get_groups_count, .get_group_name = sirfsoc_get_group_name, .get_group_pins = sirfsoc_get_group_pins, @@ -1181,7 +1181,7 @@ static int sirfsoc_pinmux_request_gpio(struct pinctrl_dev *pmxdev, return 0; } -static struct pinmux_ops sirfsoc_pinmux_ops = { +static const struct pinmux_ops sirfsoc_pinmux_ops = { .enable = sirfsoc_pinmux_enable, .disable = sirfsoc_pinmux_disable, .get_functions_count = sirfsoc_pinmux_get_funcs_count, diff --git a/drivers/pinctrl/pinctrl-sunxi.c b/drivers/pinctrl/pinctrl-sunxi.c index 80b11e3415bc..46b8f2d4f0a5 100644 --- a/drivers/pinctrl/pinctrl-sunxi.c +++ b/drivers/pinctrl/pinctrl-sunxi.c @@ -1029,7 +1029,7 @@ static void sunxi_pctrl_dt_free_map(struct pinctrl_dev *pctldev, kfree(map); } -static struct pinctrl_ops sunxi_pctrl_ops = { +static const struct pinctrl_ops sunxi_pctrl_ops = { .dt_node_to_map = sunxi_pctrl_dt_node_to_map, .dt_free_map = sunxi_pctrl_dt_free_map, .get_groups_count = sunxi_pctrl_get_groups_count, @@ -1098,7 +1098,7 @@ static int sunxi_pconf_group_set(struct pinctrl_dev *pctldev, return 0; } -static struct pinconf_ops sunxi_pconf_ops = { +static const struct pinconf_ops sunxi_pconf_ops = { .pin_config_group_get = sunxi_pconf_group_get, .pin_config_group_set = sunxi_pconf_group_set, }; @@ -1204,7 +1204,7 @@ error: return ret; } -static struct pinmux_ops sunxi_pmx_ops = { +static const struct pinmux_ops sunxi_pmx_ops = { .get_functions_count = sunxi_pmx_get_funcs_cnt, .get_function_name = sunxi_pmx_get_func_name, .get_function_groups = sunxi_pmx_get_func_groups, diff --git a/drivers/pinctrl/pinctrl-tegra.c b/drivers/pinctrl/pinctrl-tegra.c index f195d77a3572..2fa9bc6cd7ab 100644 --- a/drivers/pinctrl/pinctrl-tegra.c +++ b/drivers/pinctrl/pinctrl-tegra.c @@ -316,7 +316,7 @@ static int tegra_pinctrl_dt_node_to_map(struct pinctrl_dev *pctldev, return 0; } -static struct pinctrl_ops tegra_pinctrl_ops = { +static const struct pinctrl_ops tegra_pinctrl_ops = { .get_groups_count = tegra_pinctrl_get_groups_count, .get_group_name = tegra_pinctrl_get_group_name, .get_group_pins = tegra_pinctrl_get_group_pins, @@ -401,7 +401,7 @@ static void tegra_pinctrl_disable(struct pinctrl_dev *pctldev, pmx_writel(pmx, val, g->mux_bank, g->mux_reg); } -static struct pinmux_ops tegra_pinmux_ops = { +static const struct pinmux_ops tegra_pinmux_ops = { .get_functions_count = tegra_pinctrl_get_funcs_count, .get_function_name = tegra_pinctrl_get_func_name, .get_function_groups = tegra_pinctrl_get_func_groups, @@ -676,7 +676,7 @@ static void tegra_pinconf_config_dbg_show(struct pinctrl_dev *pctldev, } #endif 
-static struct pinconf_ops tegra_pinconf_ops = { +static const struct pinconf_ops tegra_pinconf_ops = { .pin_config_get = tegra_pinconf_get, .pin_config_set = tegra_pinconf_set, .pin_config_group_get = tegra_pinconf_group_get, diff --git a/drivers/pinctrl/pinctrl-u300.c b/drivers/pinctrl/pinctrl-u300.c index 2b5772550836..6a3a7503e6a0 100644 --- a/drivers/pinctrl/pinctrl-u300.c +++ b/drivers/pinctrl/pinctrl-u300.c @@ -860,7 +860,7 @@ static void u300_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, seq_printf(s, " " DRIVER_NAME); } -static struct pinctrl_ops u300_pctrl_ops = { +static const struct pinctrl_ops u300_pctrl_ops = { .get_groups_count = u300_get_groups_count, .get_group_name = u300_get_group_name, .get_group_pins = u300_get_group_pins, @@ -1003,7 +1003,7 @@ static int u300_pmx_get_groups(struct pinctrl_dev *pctldev, unsigned selector, return 0; } -static struct pinmux_ops u300_pmx_ops = { +static const struct pinmux_ops u300_pmx_ops = { .get_functions_count = u300_pmx_get_funcs_count, .get_function_name = u300_pmx_get_func_name, .get_function_groups = u300_pmx_get_groups, @@ -1046,7 +1046,7 @@ static int u300_pin_config_set(struct pinctrl_dev *pctldev, unsigned pin, return 0; } -static struct pinconf_ops u300_pconf_ops = { +static const struct pinconf_ops u300_pconf_ops = { .is_generic = true, .pin_config_get = u300_pin_config_get, .pin_config_set = u300_pin_config_set, diff --git a/drivers/pinctrl/pinctrl-xway.c b/drivers/pinctrl/pinctrl-xway.c index 068224efa6fa..f2977cff8366 100644 --- a/drivers/pinctrl/pinctrl-xway.c +++ b/drivers/pinctrl/pinctrl-xway.c @@ -553,7 +553,7 @@ int xway_pinconf_group_set(struct pinctrl_dev *pctldev, return ret; } -static struct pinconf_ops xway_pinconf_ops = { +static const struct pinconf_ops xway_pinconf_ops = { .pin_config_get = xway_pinconf_get, .pin_config_set = xway_pinconf_set, .pin_config_group_set = xway_pinconf_group_set, diff --git a/drivers/pinctrl/spear/pinctrl-spear.c b/drivers/pinctrl/spear/pinctrl-spear.c index 6a7dae70db08..116da0412c4b 100644 --- a/drivers/pinctrl/spear/pinctrl-spear.c +++ b/drivers/pinctrl/spear/pinctrl-spear.c @@ -198,7 +198,7 @@ static void spear_pinctrl_dt_free_map(struct pinctrl_dev *pctldev, kfree(map); } -static struct pinctrl_ops spear_pinctrl_ops = { +static const struct pinctrl_ops spear_pinctrl_ops = { .get_groups_count = spear_pinctrl_get_groups_cnt, .get_group_name = spear_pinctrl_get_group_name, .get_group_pins = spear_pinctrl_get_group_pins, @@ -340,7 +340,7 @@ static void gpio_disable_free(struct pinctrl_dev *pctldev, gpio_request_endisable(pctldev, range, offset, false); } -static struct pinmux_ops spear_pinmux_ops = { +static const struct pinmux_ops spear_pinmux_ops = { .get_functions_count = spear_pinctrl_get_funcs_count, .get_function_name = spear_pinctrl_get_func_name, .get_function_groups = spear_pinctrl_get_func_groups, diff --git a/include/linux/pinctrl/pinctrl.h b/include/linux/pinctrl/pinctrl.h index 778804df293f..2c2a9e8d8578 100644 --- a/include/linux/pinctrl/pinctrl.h +++ b/include/linux/pinctrl/pinctrl.h @@ -118,9 +118,9 @@ struct pinctrl_desc { const char *name; struct pinctrl_pin_desc const *pins; unsigned int npins; - struct pinctrl_ops *pctlops; - struct pinmux_ops *pmxops; - struct pinconf_ops *confops; + const struct pinctrl_ops *pctlops; + const struct pinmux_ops *pmxops; + const struct pinconf_ops *confops; struct module *owner; }; -- cgit From 3e1a498f2728476535571d270081a17fdfceaf26 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 25 Feb 2013 14:57:35 +0000 
Subject: mfd: ab8500-core: Add Interrupt support for ab8540 ITSource/ITLatch 7, 8, 9 and 10 don't exist on AB8540. This patch replaces them with '-1' in the interrupt list, and handles the '-1' in the code accordingly. Signed-off-by: Lee Jones Acked-by: Samuel Ortiz --- drivers/mfd/ab8500-core.c | 50 +++++++++++++++++++++++++++++++++------ include/linux/mfd/abx500/ab8500.h | 1 + 2 files changed, 44 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c index 23db4fcea496..0ba08a26cf73 100644 --- a/drivers/mfd/ab8500-core.c +++ b/drivers/mfd/ab8500-core.c @@ -103,8 +103,10 @@ #define AB8500_IT_LATCHHIER1_REG 0x60 #define AB8500_IT_LATCHHIER2_REG 0x61 #define AB8500_IT_LATCHHIER3_REG 0x62 +#define AB8540_IT_LATCHHIER4_REG 0x63 #define AB8500_IT_LATCHHIER_NUM 3 +#define AB8540_IT_LATCHHIER_NUM 4 #define AB8500_REV_REG 0x80 #define AB8500_IC_NAME_REG 0x82 @@ -143,6 +145,12 @@ static const int ab9540_irq_regoffset[AB9540_NUM_IRQ_REGS] = { 0, 1, 2, 3, 4, 6, 7, 8, 9, 11, 18, 19, 20, 21, 12, 13, 24, 5, 22, 23 }; +/* AB8540 support */ +static const int ab8540_irq_regoffset[AB8540_NUM_IRQ_REGS] = { + 0, 1, 2, 3, 4, -1, -1, -1, -1, 11, 18, 19, 20, 21, 12, 13, 24, 5, 22, 23, + 25, 26, 27, 28, 29, 30, 31, +}; + static const char ab8500_version_str[][7] = { [AB8500_VERSION_AB8500] = "AB8500", [AB8500_VERSION_AB8505] = "AB8505", @@ -360,6 +368,9 @@ static void ab8500_irq_sync_unlock(struct irq_data *data) is_ab8500_1p1_or_earlier(ab8500)) continue; + if (ab8500->irq_reg_offset[i] < 0) + continue; + ab8500->oldmask[i] = new; reg = AB8500_IT_MASK1_REG + ab8500->irq_reg_offset[i]; @@ -431,6 +442,18 @@ static struct irq_chip ab8500_irq_chip = { .irq_set_type = ab8500_irq_set_type, }; +static void update_latch_offset(u8 *offset, int i) +{ + /* Fix inconsistent ITFromLatch25 bit mapping... */ + if (unlikely(*offset == 17)) + *offset = 24; + /* Fix inconsistent ab8540 bit mapping... */ + if (unlikely(*offset == 16)) + *offset = 25; + if ((i==3) && (*offset >= 24)) + *offset += 2; +} + static int ab8500_handle_hierarchical_line(struct ab8500 *ab8500, int latch_offset, u8 latch_val) { @@ -482,9 +505,7 @@ static int ab8500_handle_hierarchical_latch(struct ab8500 *ab8500, latch_bit = __ffs(hier_val); latch_offset = (hier_offset << 3) + latch_bit; - /* Fix inconsistent ITFromLatch25 bit mapping... 
*/ - if (unlikely(latch_offset == 17)) - latch_offset = 24; + update_latch_offset(&latch_offset, hier_offset); status = get_register_interruptible(ab8500, AB8500_INTERRUPT, @@ -512,7 +533,7 @@ static irqreturn_t ab8500_hierarchical_irq(int irq, void *dev) dev_vdbg(ab8500->dev, "interrupt\n"); /* Hierarchical interrupt version */ - for (i = 0; i < AB8500_IT_LATCHHIER_NUM; i++) { + for (i = 0; i < (ab8500->it_latchhier_num); i++) { int status; u8 hier_val; @@ -565,6 +586,9 @@ static irqreturn_t ab8500_irq(int irq, void *dev) if (regoffset == 11 && is_ab8500_1p1_or_earlier(ab8500)) continue; + if (regoffset < 0) + continue; + status = get_register_interruptible(ab8500, AB8500_INTERRUPT, AB8500_IT_LATCH1_REG + regoffset, &value); if (status < 0 || value == 0) @@ -615,7 +639,9 @@ static int ab8500_irq_init(struct ab8500 *ab8500, struct device_node *np) { int num_irqs; - if (is_ab9540(ab8500)) + if (is_ab8540(ab8500)) + num_irqs = AB8540_NR_IRQS; + else if (is_ab9540(ab8500)) num_irqs = AB9540_NR_IRQS; else if (is_ab8505(ab8500)) num_irqs = AB8505_NR_IRQS; @@ -1552,13 +1578,20 @@ static int ab8500_probe(struct platform_device *pdev) ab8500->chip_id >> 4, ab8500->chip_id & 0x0F); - /* Configure AB8500 or AB9540 IRQ */ - if (is_ab9540(ab8500) || is_ab8505(ab8500)) { + /* Configure AB8540 */ + if (is_ab8540(ab8500)) { + ab8500->mask_size = AB8540_NUM_IRQ_REGS; + ab8500->irq_reg_offset = ab8540_irq_regoffset; + ab8500->it_latchhier_num = AB8540_IT_LATCHHIER_NUM; + }/* Configure AB8500 or AB9540 IRQ */ + else if (is_ab9540(ab8500) || is_ab8505(ab8500)) { ab8500->mask_size = AB9540_NUM_IRQ_REGS; ab8500->irq_reg_offset = ab9540_irq_regoffset; + ab8500->it_latchhier_num = AB8500_IT_LATCHHIER_NUM; } else { ab8500->mask_size = AB8500_NUM_IRQ_REGS; ab8500->irq_reg_offset = ab8500_irq_regoffset; + ab8500->it_latchhier_num = AB8500_IT_LATCHHIER_NUM; } ab8500->mask = devm_kzalloc(&pdev->dev, ab8500->mask_size, GFP_KERNEL); if (!ab8500->mask) @@ -1620,6 +1653,9 @@ static int ab8500_probe(struct platform_device *pdev) is_ab8500_1p1_or_earlier(ab8500)) continue; + if (ab8500->irq_reg_offset[i] < 0) + continue; + get_register_interruptible(ab8500, AB8500_INTERRUPT, AB8500_IT_LATCH1_REG + ab8500->irq_reg_offset[i], &value); diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h index fdd8be64feeb..b5780fd40fe4 100644 --- a/include/linux/mfd/abx500/ab8500.h +++ b/include/linux/mfd/abx500/ab8500.h @@ -362,6 +362,7 @@ struct ab8500 { u8 *oldmask; int mask_size; const int *irq_reg_offset; + int it_latchhier_num; }; struct regulator_reg_init; -- cgit From 75932094601b404fc9ef28f7b6c0aa83dd619af0 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 12 Feb 2013 15:11:19 +0000 Subject: mfd: ab8500-sysctrl: Add new reset function Add a new reset function which uses the AB WD with 0 timeout. 
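Here the "AB WD" is the AB8500 main watchdog: ab8500_restart() first disables the RTC alarm, stashes a 16-bit reboot-reason code in the spare RTC alarm minute registers, then programs the watchdog with a zero timeout and kicks it so it expires immediately and hard-resets the platform. The sketch below shows how board-support code might wire this up. Only ab8500_restart() and the reboot_reason_code callback come from this patch; the board_* names, the reason-code values, and the arm_pm_restart assignment (the ARM restart hook of this kernel generation) are assumptions added for illustration.

#include <linux/init.h>
#include <linux/string.h>
#include <linux/mfd/abx500/ab8500-sysctrl.h>
#include <asm/system_misc.h>	/* arm_pm_restart; assumed ARM-only board code */

/* Hypothetical mapping from reboot command strings to reason codes. */
static u16 board_reboot_reason_code(const char *cmd)
{
	if (cmd && !strcmp(cmd, "recovery"))
		return 0x0001;	/* made-up code, read back from the RTC regs after restart */
	if (cmd && !strcmp(cmd, "charging"))
		return 0x0002;	/* made-up code */
	return 0;		/* plain reboot; matches the driver's default reason */
}

/* Reaches the sysctrl driver through ab8500_platform_data.sysctrl. */
static struct ab8500_sysctrl_platform_data board_sysctrl_pdata = {
	.reboot_reason_code = board_reboot_reason_code,
};

static void __init board_restart_init(void)
{
	/* ab8500_restart(char mode, const char *cmd) matches this hook. */
	arm_pm_restart = ab8500_restart;
}
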
Signed-off-by: Lee Jones Acked-by: Samuel Ortiz --- drivers/mfd/ab8500-sysctrl.c | 63 +++++++++++++++++++++++++++++++ include/linux/mfd/abx500/ab8500-sysctrl.h | 6 +++ 2 files changed, 69 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-sysctrl.c b/drivers/mfd/ab8500-sysctrl.c index 6ac63a05893c..f43c42b9f32c 100644 --- a/drivers/mfd/ab8500-sysctrl.c +++ b/drivers/mfd/ab8500-sysctrl.c @@ -15,6 +15,12 @@ #include #include +/* RtcCtrl bits */ +#define AB8500_ALARM_MIN_LOW 0x08 +#define AB8500_ALARM_MIN_MID 0x09 +#define RTC_CTRL 0x0B +#define RTC_ALARM_ENABLE 0x4 + static struct device *sysctrl_dev; void ab8500_power_off(void) @@ -79,6 +85,63 @@ shutdown: } } +/* + * Use the AB WD to reset the platform. It will perform a hard + * reset instead of a soft reset. Write the reset reason to + * the AB before reset, which can be read upon restart. + */ +void ab8500_restart(char mode, const char *cmd) +{ + struct ab8500_platform_data *plat; + struct ab8500_sysctrl_platform_data *pdata; + u16 reason = 0; + u8 val; + + if (sysctrl_dev == NULL) { + pr_err("%s: sysctrl not initialized\n", __func__); + return; + } + + plat = dev_get_platdata(sysctrl_dev->parent); + pdata = plat->sysctrl; + if (pdata->reboot_reason_code) + reason = pdata->reboot_reason_code(cmd); + else + pr_warn("[%s] No reboot reason set. Default reason %d\n", + __func__, reason); + + /* + * Disable RTC alarm, just a precaution so that no alarm + * is running when WD reset is executed. + */ + abx500_get_register_interruptible(sysctrl_dev, AB8500_RTC, + RTC_CTRL , &val); + abx500_set_register_interruptible(sysctrl_dev, AB8500_RTC, + RTC_CTRL , (val & ~RTC_ALARM_ENABLE)); + + /* + * Android is not using the RTC alarm registers during reboot + * so we borrow them for writing the reason of reset + */ + + /* reason[8 LSB] */ + val = reason & 0xFF; + abx500_set_register_interruptible(sysctrl_dev, AB8500_RTC, + AB8500_ALARM_MIN_LOW , val); + + /* reason[8 MSB] */ + val = (reason>>8) & 0xFF; + abx500_set_register_interruptible(sysctrl_dev, AB8500_RTC, + AB8500_ALARM_MIN_MID , val); + + /* Setting WD timeout to 0 */ + ab8500_sysctrl_write(AB8500_MAINWDOGTIMER, 0xFF, 0x0); + + /* Setting the parameters to AB8500 WD*/ + ab8500_sysctrl_write(AB8500_MAINWDOGCTRL, 0xFF, (AB8500_ENABLE_WD | + AB8500_WD_RESTART_ON_EXPIRE | AB8500_KICK_WD)); +} + static inline bool valid_bank(u8 bank) { return ((bank == AB8500_SYS_CTRL1_BLOCK) || diff --git a/include/linux/mfd/abx500/ab8500-sysctrl.h b/include/linux/mfd/abx500/ab8500-sysctrl.h index ebf12e793db9..990bc93f46e1 100644 --- a/include/linux/mfd/abx500/ab8500-sysctrl.h +++ b/include/linux/mfd/abx500/ab8500-sysctrl.h @@ -12,6 +12,7 @@ int ab8500_sysctrl_read(u16 reg, u8 *value); int ab8500_sysctrl_write(u16 reg, u8 mask, u8 value); +void ab8500_restart(char mode, const char *cmd); #else @@ -40,6 +41,7 @@ static inline int ab8500_sysctrl_clear(u16 reg, u8 bits) /* Configuration data for SysClkReq1RfClkBuf - SysClkReq8RfClkBuf */ struct ab8500_sysctrl_platform_data { u8 initial_req_buf_config[8]; + u16 (*reboot_reason_code)(const char *cmd); }; /* Registers */ @@ -299,4 +301,8 @@ struct ab8500_sysctrl_platform_data { #define AB9540_SYSCLK12BUF4VALID_SYSCLK12BUF4VALID_MASK 0xFF #define AB9540_SYSCLK12BUF4VALID_SYSCLK12BUF4VALID_SHIFT 0 +#define AB8500_ENABLE_WD 0x1 +#define AB8500_KICK_WD 0x2 +#define AB8500_WD_RESTART_ON_EXPIRE 0x10 + #endif /* __AB8500_SYSCTRL_H */ -- cgit From e4bffe8d8ad9856143b6e941a17870aee37413d7 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Mon, 11 Feb 2013 
10:38:00 +0000 Subject: mfd: ab8500-gpadc: Add support for the AB8540 This patch enables the GPADC to work on AB8540 based platforms. Signed-off-by: Lee Jones Acked-by: Samuel Ortiz --- drivers/mfd/ab8500-gpadc.c | 316 ++++++++++++++++++++++++++++---- include/linux/mfd/abx500/ab8500-gpadc.h | 43 +++-- 2 files changed, 305 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c index fc8da4496e84..c985b90577f6 100644 --- a/drivers/mfd/ab8500-gpadc.c +++ b/drivers/mfd/ab8500-gpadc.c @@ -37,6 +37,13 @@ #define AB8500_GPADC_AUTODATAL_REG 0x07 #define AB8500_GPADC_AUTODATAH_REG 0x08 #define AB8500_GPADC_MUX_CTRL_REG 0x09 +#define AB8540_GPADC_MANDATA2L_REG 0x09 +#define AB8540_GPADC_MANDATA2H_REG 0x0A +#define AB8540_GPADC_APEAAX_REG 0x10 +#define AB8540_GPADC_APEAAT_REG 0x11 +#define AB8540_GPADC_APEAAM_REG 0x12 +#define AB8540_GPADC_APEAAH_REG 0x13 +#define AB8540_GPADC_APEAAL_REG 0x14 /* * OTP register offsets @@ -49,6 +56,10 @@ #define AB8500_GPADC_CAL_5 0x13 #define AB8500_GPADC_CAL_6 0x14 #define AB8500_GPADC_CAL_7 0x15 +/* New calibration for 8540 */ +#define AB8540_GPADC_OTP4_REG_7 0x38 +#define AB8540_GPADC_OTP4_REG_6 0x39 +#define AB8540_GPADC_OTP4_REG_5 0x3A /* gpadc constants */ #define EN_VINTCORE12 0x04 @@ -67,6 +78,7 @@ #define GPADC_BUSY 0x01 #define EN_FALLING 0x10 #define EN_TRIG_EDGE 0x02 +#define EN_VBIAS_XTAL_TEMP 0x02 /* GPADC constants from AB8500 spec, UM0836 */ #define ADC_RESOLUTION 1024 @@ -85,8 +97,21 @@ #define ADC_CH_BKBAT_MIN 0 #define ADC_CH_BKBAT_MAX 3200 +/* GPADC constants from AB8540 spec */ +#define ADC_CH_IBAT_MIN (-6000) /* mA range measured by ADC for ibat*/ +#define ADC_CH_IBAT_MAX 6000 +#define ADC_CH_IBAT_MIN_V (-60) /* mV range measured by ADC for ibat*/ +#define ADC_CH_IBAT_MAX_V 60 +#define IBAT_VDROP_L (-56) /* mV */ +#define IBAT_VDROP_H 56 + /* This is used to not lose precision when dividing to get gain and offset */ -#define CALIB_SCALE 1000 +#define CALIB_SCALE 1000 +/* + * Number of bits shift used to not lose precision + * when dividing to get ibat gain. 
+ */ +#define CALIB_SHIFT_IBAT 20 /* Time in ms before disabling regulator */ #define GPADC_AUDOSUSPEND_DELAY 1 @@ -97,6 +122,7 @@ enum cal_channels { ADC_INPUT_VMAIN = 0, ADC_INPUT_BTEMP, ADC_INPUT_VBAT, + ADC_INPUT_IBAT, NBR_CAL_INPUTS, }; @@ -107,8 +133,8 @@ enum cal_channels { * @offset: Offset of the ADC channel */ struct adc_cal_data { - u64 gain; - u64 offset; + s64 gain; + s64 offset; }; /** @@ -180,6 +206,7 @@ int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, u8 channel, gpadc->cal_data[ADC_INPUT_VMAIN].offset) / CALIB_SCALE; break; + case XTAL_TEMP: case BAT_CTRL: case BTEMP_BALL: case ACC_DETECT1: @@ -198,6 +225,7 @@ int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, u8 channel, break; case MAIN_BAT_V: + case VBAT_TRUE_MEAS: /* For some reason we don't have calibrated data */ if (!gpadc->cal_data[ADC_INPUT_VBAT].gain) { res = ADC_CH_VBAT_MIN + (ADC_CH_VBAT_MAX - @@ -241,6 +269,20 @@ int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, u8 channel, ADC_RESOLUTION; break; + case IBAT_VIRTUAL_CHANNEL: + /* For some reason we don't have calibrated data */ + if (!gpadc->cal_data[ADC_INPUT_IBAT].gain) { + res = ADC_CH_IBAT_MIN + (ADC_CH_IBAT_MAX - + ADC_CH_IBAT_MIN) * ad_value / + ADC_RESOLUTION; + break; + } + /* Here we can use the calibrated data */ + res = (int) (ad_value * gpadc->cal_data[ADC_INPUT_IBAT].gain + + gpadc->cal_data[ADC_INPUT_IBAT].offset) + >> CALIB_SHIFT_IBAT; + break; + default: dev_err(gpadc->dev, "unknown channel, not possible to convert\n"); @@ -303,10 +345,20 @@ EXPORT_SYMBOL(ab8500_gpadc_convert); */ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type) +{ + int raw_data; + raw_data = ab8500_gpadc_double_read_raw(gpadc, channel, + avg_sample, trig_edge, trig_timer, conv_type, NULL); + return raw_data; +} + +int ab8500_gpadc_double_read_raw(struct ab8500_gpadc *gpadc, u8 channel, + u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type, + int *ibat) { int ret; int looplimit = 0; - u8 val, low_data, high_data; + u8 val, low_data, high_data, low_data2, high_data2; if (!gpadc) return -ENODEV; @@ -359,7 +411,6 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, default: val = channel | AVG_16; break; - } if (conv_type == ADC_HW) @@ -383,8 +434,8 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, ret = abx500_mask_and_set_register_interruptible(gpadc->dev, AB8500_GPADC, AB8500_GPADC_CTRL1_REG, EN_FALLING, EN_FALLING); - } + switch (channel) { case MAIN_CHARGER_C: case USB_CHARGER_C: @@ -401,6 +452,55 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, EN_BUF | EN_ICHAR, EN_BUF | EN_ICHAR); break; + + case XTAL_TEMP: + if (conv_type == ADC_HW) + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF | EN_TRIG_EDGE, + EN_BUF | EN_TRIG_EDGE); + else + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF , + EN_BUF); + break; + + case VBAT_TRUE_MEAS: + if (conv_type == ADC_HW) + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF | EN_TRIG_EDGE, + EN_BUF | EN_TRIG_EDGE); + else + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF , + EN_BUF); + break; + + case BAT_CTRL_AND_IBAT: + case VBAT_MEAS_AND_IBAT: + case VBAT_TRUE_MEAS_AND_IBAT: + case BAT_TEMP_AND_IBAT: + if (conv_type == ADC_HW) 
+ ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_TRIG_EDGE, + EN_TRIG_EDGE); + else + ret = abx500_mask_and_set_register_interruptible( + gpadc->dev, + AB8500_GPADC, AB8500_GPADC_CTRL1_REG, + EN_BUF, + 0); + break; + case BTEMP_BALL: if (!is_ab8500_2p0_or_earlier(gpadc->parent)) { if (conv_type == ADC_HW) @@ -471,21 +571,19 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, /* wait for completion of conversion */ if (conv_type == ADC_HW) { if (!wait_for_completion_timeout(&gpadc->ab8500_gpadc_complete, - 2*HZ)) { - dev_err(gpadc->dev, - "timeout didn't receive" - " hw GPADC conv interrupt\n"); - ret = -EINVAL; - goto out; + 2 * HZ)) { + dev_err(gpadc->dev, + "timeout didn't receive hw GPADC conv interrupt\n"); + ret = -EINVAL; + goto out; } } else { if (!wait_for_completion_timeout(&gpadc->ab8500_gpadc_complete, - msecs_to_jiffies(CONVERSION_TIME))) { - dev_err(gpadc->dev, - "timeout didn't receive" - " sw GPADC conv interrupt\n"); - ret = -EINVAL; - goto out; + msecs_to_jiffies(CONVERSION_TIME))) { + dev_err(gpadc->dev, + "timeout didn't receive sw GPADC conv interrupt\n"); + ret = -EINVAL; + goto out; } } @@ -523,6 +621,46 @@ int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, goto out; } } + /* Check if double convertion is required */ + if ((channel == BAT_CTRL_AND_IBAT) || + (channel == VBAT_MEAS_AND_IBAT) || + (channel == VBAT_TRUE_MEAS_AND_IBAT) || + (channel == BAT_TEMP_AND_IBAT)) { + + if (conv_type == ADC_HW) { + /* not supported */ + ret = -ENOTSUPP; + dev_err(gpadc->dev, + "gpadc_conversion: only SW double conversion supported\n"); + goto out; + } else { + /* Read the converted RAW data 2 */ + ret = abx500_get_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8540_GPADC_MANDATA2L_REG, + &low_data2); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: read sw low data 2 failed\n"); + goto out; + } + + ret = abx500_get_register_interruptible(gpadc->dev, + AB8500_GPADC, AB8540_GPADC_MANDATA2H_REG, + &high_data2); + if (ret < 0) { + dev_err(gpadc->dev, + "gpadc_conversion: read sw high data 2 failed\n"); + goto out; + } + if (ibat != NULL) { + *ibat = (high_data2 << 8) | low_data2; + } else { + dev_warn(gpadc->dev, + "gpadc_conversion: ibat not stored\n"); + } + + } + } /* Disable GPADC */ ret = abx500_set_register_interruptible(gpadc->dev, AB8500_GPADC, @@ -586,15 +724,27 @@ static int otp_cal_regs[] = { AB8500_GPADC_CAL_7, }; +static int otp4_cal_regs[] = { + AB8540_GPADC_OTP4_REG_7, + AB8540_GPADC_OTP4_REG_6, + AB8540_GPADC_OTP4_REG_5, +}; + static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) { int i; int ret[ARRAY_SIZE(otp_cal_regs)]; u8 gpadc_cal[ARRAY_SIZE(otp_cal_regs)]; - + int ret_otp4[ARRAY_SIZE(otp4_cal_regs)]; + u8 gpadc_otp4[ARRAY_SIZE(otp4_cal_regs)]; int vmain_high, vmain_low; int btemp_high, btemp_low; int vbat_high, vbat_low; + int ibat_high, ibat_low; + s64 V_gain, V_offset, V2A_gain, V2A_offset; + struct ab8500 *ab8500; + + ab8500 = gpadc->parent; /* First we read all OTP registers and store the error code */ for (i = 0; i < ARRAY_SIZE(otp_cal_regs); i++) { @@ -614,7 +764,7 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) * bt_h/l = btemp_high/low * vb_h/l = vbat_high/low * - * Data bits: + * Data bits 8500/9540: * | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 * |.......|.......|.......|.......|.......|.......|.......|....... 
* | | vm_h9 | vm_h8 @@ -632,6 +782,35 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) * | vb_l5 | vb_l4 | vb_l3 | vb_l2 | vb_l1 | vb_l0 | * |.......|.......|.......|.......|.......|.......|.......|....... * + * Data bits 8540: + * OTP2 + * | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | + * |.......|.......|.......|.......|.......|.......|.......|....... + * | vm_h9 | vm_h8 | vm_h7 | vm_h6 | vm_h5 | vm_h4 | vm_h3 | vm_h2 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | vm_h1 | vm_h0 | vm_l4 | vm_l3 | vm_l2 | vm_l1 | vm_l0 | bt_h9 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | bt_h8 | bt_h7 | bt_h6 | bt_h5 | bt_h4 | bt_h3 | bt_h2 | bt_h1 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | bt_h0 | bt_l4 | bt_l3 | bt_l2 | bt_l1 | bt_l0 | vb_h9 | vb_h8 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | vb_h7 | vb_h6 | vb_h5 | vb_h4 | vb_h3 | vb_h2 | vb_h1 | vb_h0 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | vb_l5 | vb_l4 | vb_l3 | vb_l2 | vb_l1 | vb_l0 | + * |.......|.......|.......|.......|.......|.......|.......|....... + * + * Data bits 8540: + * OTP4 + * | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | | ib_h9 | ib_h8 | ib_h7 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | ib_h6 | ib_h5 | ib_h4 | ib_h3 | ib_h2 | ib_h1 | ib_h0 | ib_l5 + * |.......|.......|.......|.......|.......|.......|.......|....... + * | ib_l4 | ib_l3 | ib_l2 | ib_l1 | ib_l0 | + * * * Ideal output ADC codes corresponding to injected input voltages * during manufacturing is: @@ -644,38 +823,96 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) * vbat_low: Vin = 2380mV / ADC ideal code = 33 */ - /* Calculate gain and offset for VMAIN if all reads succeeded */ - if (!(ret[0] < 0 || ret[1] < 0 || ret[2] < 0)) { - vmain_high = (((gpadc_cal[0] & 0x03) << 8) | - ((gpadc_cal[1] & 0x3F) << 2) | - ((gpadc_cal[2] & 0xC0) >> 6)); + if (is_ab8540(ab8500)) { + /* Calculate gain and offset for VMAIN if all reads succeeded*/ + if (!(ret[1] < 0 || ret[2] < 0)) { + vmain_high = (((gpadc_cal[1] & 0xFF) << 2) | + ((gpadc_cal[2] & 0xC0) >> 6)); + vmain_low = ((gpadc_cal[2] & 0x3E) >> 1); + gpadc->cal_data[ADC_INPUT_VMAIN].gain = CALIB_SCALE * + (19500 - 315) / (vmain_high - vmain_low); + gpadc->cal_data[ADC_INPUT_VMAIN].offset = CALIB_SCALE * + 19500 - (CALIB_SCALE * (19500 - 315) / + (vmain_high - vmain_low)) * vmain_high; + } else { + gpadc->cal_data[ADC_INPUT_VMAIN].gain = 0; + } - vmain_low = ((gpadc_cal[2] & 0x3E) >> 1); + /* Read IBAT calibration Data */ + for (i = 0; i < ARRAY_SIZE(otp4_cal_regs); i++) { + ret_otp4[i] = abx500_get_register_interruptible( + gpadc->dev, AB8500_OTP_EMUL, + otp4_cal_regs[i], &gpadc_otp4[i]); + if (ret_otp4[i] < 0) + dev_err(gpadc->dev, + "%s: read otp4 reg 0x%02x failed\n", + __func__, otp4_cal_regs[i]); + } - gpadc->cal_data[ADC_INPUT_VMAIN].gain = CALIB_SCALE * - (19500 - 315) / (vmain_high - vmain_low); + /* Calculate gain and offset for IBAT if all reads succeeded */ + if (!(ret_otp4[0] < 0 || ret_otp4[1] < 0 || ret_otp4[2] < 0)) { + ibat_high = (((gpadc_otp4[0] & 0x07) << 7) | + ((gpadc_otp4[1] & 0xFE) >> 1)); + ibat_low = (((gpadc_otp4[1] & 0x01) << 5) | + ((gpadc_otp4[2] & 0xF8) >> 3)); + + V_gain = ((IBAT_VDROP_H - IBAT_VDROP_L) + << CALIB_SHIFT_IBAT) / (ibat_high - 
ibat_low); + + V_offset = (IBAT_VDROP_H << CALIB_SHIFT_IBAT) - + (((IBAT_VDROP_H - IBAT_VDROP_L) << + CALIB_SHIFT_IBAT) / (ibat_high - ibat_low)) + * ibat_high; + /* + * Result obtained is in mV (at a scale factor), + * we need to calculate gain and offset to get mA + */ + V2A_gain = (ADC_CH_IBAT_MAX - ADC_CH_IBAT_MIN)/ + (ADC_CH_IBAT_MAX_V - ADC_CH_IBAT_MIN_V); + V2A_offset = ((ADC_CH_IBAT_MAX_V * ADC_CH_IBAT_MIN - + ADC_CH_IBAT_MAX * ADC_CH_IBAT_MIN_V) + << CALIB_SHIFT_IBAT) + / (ADC_CH_IBAT_MAX_V - ADC_CH_IBAT_MIN_V); + + gpadc->cal_data[ADC_INPUT_IBAT].gain = V_gain * V2A_gain; + gpadc->cal_data[ADC_INPUT_IBAT].offset = V_offset * + V2A_gain + V2A_offset; + } else { + gpadc->cal_data[ADC_INPUT_IBAT].gain = 0; + } - gpadc->cal_data[ADC_INPUT_VMAIN].offset = CALIB_SCALE * 19500 - - (CALIB_SCALE * (19500 - 315) / - (vmain_high - vmain_low)) * vmain_high; + dev_dbg(gpadc->dev, "IBAT gain %llu offset %llu\n", + gpadc->cal_data[ADC_INPUT_IBAT].gain, + gpadc->cal_data[ADC_INPUT_IBAT].offset); } else { - gpadc->cal_data[ADC_INPUT_VMAIN].gain = 0; + /* Calculate gain and offset for VMAIN if all reads succeeded */ + if (!(ret[0] < 0 || ret[1] < 0 || ret[2] < 0)) { + vmain_high = (((gpadc_cal[0] & 0x03) << 8) | + ((gpadc_cal[1] & 0x3F) << 2) | + ((gpadc_cal[2] & 0xC0) >> 6)); + vmain_low = ((gpadc_cal[2] & 0x3E) >> 1); + + gpadc->cal_data[ADC_INPUT_VMAIN].gain = CALIB_SCALE * + (19500 - 315) / (vmain_high - vmain_low); + + gpadc->cal_data[ADC_INPUT_VMAIN].offset = CALIB_SCALE * + 19500 - (CALIB_SCALE * (19500 - 315) / + (vmain_high - vmain_low)) * vmain_high; + } else { + gpadc->cal_data[ADC_INPUT_VMAIN].gain = 0; + } } - /* Calculate gain and offset for BTEMP if all reads succeeded */ if (!(ret[2] < 0 || ret[3] < 0 || ret[4] < 0)) { btemp_high = (((gpadc_cal[2] & 0x01) << 9) | - (gpadc_cal[3] << 1) | - ((gpadc_cal[4] & 0x80) >> 7)); - + (gpadc_cal[3] << 1) | ((gpadc_cal[4] & 0x80) >> 7)); btemp_low = ((gpadc_cal[4] & 0x7C) >> 2); gpadc->cal_data[ADC_INPUT_BTEMP].gain = CALIB_SCALE * (1300 - 21) / (btemp_high - btemp_low); - gpadc->cal_data[ADC_INPUT_BTEMP].offset = CALIB_SCALE * 1300 - - (CALIB_SCALE * (1300 - 21) / - (btemp_high - btemp_low)) * btemp_high; + (CALIB_SCALE * (1300 - 21) / (btemp_high - btemp_low)) + * btemp_high; } else { gpadc->cal_data[ADC_INPUT_BTEMP].gain = 0; } @@ -687,7 +924,6 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) gpadc->cal_data[ADC_INPUT_VBAT].gain = CALIB_SCALE * (4700 - 2380) / (vbat_high - vbat_low); - gpadc->cal_data[ADC_INPUT_VBAT].offset = CALIB_SCALE * 4700 - (CALIB_SCALE * (4700 - 2380) / (vbat_high - vbat_low)) * vbat_high; diff --git a/include/linux/mfd/abx500/ab8500-gpadc.h b/include/linux/mfd/abx500/ab8500-gpadc.h index 7694e7ab1880..4131437ace4b 100644 --- a/include/linux/mfd/abx500/ab8500-gpadc.h +++ b/include/linux/mfd/abx500/ab8500-gpadc.h @@ -12,19 +12,32 @@ /* GPADC source: From datasheet(ADCSwSel[4:0] in GPADCCtrl2 * and ADCHwSel[4:0] in GPADCCtrl3 ) */ -#define BAT_CTRL 0x01 -#define BTEMP_BALL 0x02 -#define MAIN_CHARGER_V 0x03 -#define ACC_DETECT1 0x04 -#define ACC_DETECT2 0x05 -#define ADC_AUX1 0x06 -#define ADC_AUX2 0x07 -#define MAIN_BAT_V 0x08 -#define VBUS_V 0x09 -#define MAIN_CHARGER_C 0x0A -#define USB_CHARGER_C 0x0B -#define BK_BAT_V 0x0C -#define DIE_TEMP 0x0D +#define BAT_CTRL 0x01 +#define BTEMP_BALL 0x02 +#define MAIN_CHARGER_V 0x03 +#define ACC_DETECT1 0x04 +#define ACC_DETECT2 0x05 +#define ADC_AUX1 0x06 +#define ADC_AUX2 0x07 +#define MAIN_BAT_V 0x08 +#define VBUS_V 0x09 +#define MAIN_CHARGER_C 
0x0A +#define USB_CHARGER_C 0x0B +#define BK_BAT_V 0x0C +#define DIE_TEMP 0x0D +#define USB_ID 0x0E +#define XTAL_TEMP 0x12 +#define VBAT_TRUE_MEAS 0x13 +#define BAT_CTRL_AND_IBAT 0x1C +#define VBAT_MEAS_AND_IBAT 0x1D +#define VBAT_TRUE_MEAS_AND_IBAT 0x1E +#define BAT_TEMP_AND_IBAT 0x1F + +/* Virtual channel used only for ibat convertion to ampere + * Battery current conversion (ibat) cannot be requested as a single conversion + * but it is always in combination with other input requests + */ +#define IBAT_VIRTUAL_CHANNEL 0xFF #define SAMPLE_1 1 #define SAMPLE_4 4 @@ -37,7 +50,6 @@ #define ADC_SW 0 #define ADC_HW 1 - struct ab8500_gpadc; struct ab8500_gpadc *ab8500_gpadc_get(char *name); @@ -51,6 +63,9 @@ static inline int ab8500_gpadc_convert(struct ab8500_gpadc *gpadc, u8 channel) int ab8500_gpadc_read_raw(struct ab8500_gpadc *gpadc, u8 channel, u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type); +int ab8500_gpadc_double_read_raw(struct ab8500_gpadc *gpadc, u8 channel, + u8 avg_sample, u8 trig_edge, u8 trig_timer, u8 conv_type, + int *ibat); int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, u8 channel, int ad_value); -- cgit From bc6b4132bcae4b8e59766ba2dae8f377009b26d0 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 26 Feb 2013 14:02:31 +0000 Subject: mfd: ab8500-debug: Add support for the AB8540 Allow GPADC debug information to be shown when executing on an AB8540 based platform. Signed-off-by: Alexandre Bourdiol Reviewed-by: Marcus COOPER Reviewed-by: Philippe LANGLAIS Acked-by: Samuel Ortiz --- drivers/mfd/ab8500-debugfs.c | 286 +++++++++++++++++++++++++++++++- drivers/mfd/ab8500-gpadc.c | 44 +++++ include/linux/mfd/abx500/ab8500-gpadc.h | 3 + 3 files changed, 332 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-debugfs.c b/drivers/mfd/ab8500-debugfs.c index 074eea9e4bfd..1e44d65e1771 100644 --- a/drivers/mfd/ab8500-debugfs.c +++ b/drivers/mfd/ab8500-debugfs.c @@ -1633,6 +1633,254 @@ static const struct file_operations ab8500_gpadc_die_temp_fops = { .owner = THIS_MODULE, }; +static int ab8540_gpadc_xtal_temp_print(struct seq_file *s, void *p) +{ + int xtal_temp_raw; + int xtal_temp_convert; + struct ab8500_gpadc *gpadc; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + xtal_temp_raw = ab8500_gpadc_read_raw(gpadc, XTAL_TEMP, + avg_sample, trig_edge, trig_timer, conv_type); + xtal_temp_convert = ab8500_gpadc_ad_to_voltage(gpadc, XTAL_TEMP, + xtal_temp_raw); + + return seq_printf(s, "%d,0x%X\n", + xtal_temp_convert, xtal_temp_raw); +} + +static int ab8540_gpadc_xtal_temp_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab8540_gpadc_xtal_temp_print, + inode->i_private); +} + +static const struct file_operations ab8540_gpadc_xtal_temp_fops = { + .open = ab8540_gpadc_xtal_temp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8540_gpadc_vbat_true_meas_print(struct seq_file *s, void *p) +{ + int vbat_true_meas_raw; + int vbat_true_meas_convert; + struct ab8500_gpadc *gpadc; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + vbat_true_meas_raw = ab8500_gpadc_read_raw(gpadc, VBAT_TRUE_MEAS, + avg_sample, trig_edge, trig_timer, conv_type); + vbat_true_meas_convert = ab8500_gpadc_ad_to_voltage(gpadc, VBAT_TRUE_MEAS, + vbat_true_meas_raw); + + return seq_printf(s, "%d,0x%X\n", + vbat_true_meas_convert, vbat_true_meas_raw); +} + +static int ab8540_gpadc_vbat_true_meas_open(struct inode *inode, + struct file *file) +{ + return 
single_open(file, ab8540_gpadc_vbat_true_meas_print, + inode->i_private); +} + +static const struct file_operations ab8540_gpadc_vbat_true_meas_fops = { + .open = ab8540_gpadc_vbat_true_meas_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8540_gpadc_bat_ctrl_and_ibat_print(struct seq_file *s, void *p) +{ + int bat_ctrl_raw; + int bat_ctrl_convert; + int ibat_raw; + int ibat_convert; + struct ab8500_gpadc *gpadc; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + bat_ctrl_raw = ab8500_gpadc_double_read_raw(gpadc, BAT_CTRL_AND_IBAT, + avg_sample, trig_edge, trig_timer, conv_type, &ibat_raw); + + bat_ctrl_convert = ab8500_gpadc_ad_to_voltage(gpadc, BAT_CTRL, + bat_ctrl_raw); + ibat_convert = ab8500_gpadc_ad_to_voltage(gpadc, IBAT_VIRTUAL_CHANNEL, + ibat_raw); + + return seq_printf(s, "%d,0x%X\n" "%d,0x%X\n", + bat_ctrl_convert, bat_ctrl_raw, + ibat_convert, ibat_raw); +} + +static int ab8540_gpadc_bat_ctrl_and_ibat_open(struct inode *inode, + struct file *file) +{ + return single_open(file, ab8540_gpadc_bat_ctrl_and_ibat_print, + inode->i_private); +} + +static const struct file_operations ab8540_gpadc_bat_ctrl_and_ibat_fops = { + .open = ab8540_gpadc_bat_ctrl_and_ibat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8540_gpadc_vbat_meas_and_ibat_print(struct seq_file *s, void *p) +{ + int vbat_meas_raw; + int vbat_meas_convert; + int ibat_raw; + int ibat_convert; + struct ab8500_gpadc *gpadc; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + vbat_meas_raw = ab8500_gpadc_double_read_raw(gpadc, VBAT_MEAS_AND_IBAT, + avg_sample, trig_edge, trig_timer, conv_type, &ibat_raw); + vbat_meas_convert = ab8500_gpadc_ad_to_voltage(gpadc, MAIN_BAT_V, + vbat_meas_raw); + ibat_convert = ab8500_gpadc_ad_to_voltage(gpadc, IBAT_VIRTUAL_CHANNEL, + ibat_raw); + + return seq_printf(s, "%d,0x%X\n" "%d,0x%X\n", + vbat_meas_convert, vbat_meas_raw, + ibat_convert, ibat_raw); +} + +static int ab8540_gpadc_vbat_meas_and_ibat_open(struct inode *inode, + struct file *file) +{ + return single_open(file, ab8540_gpadc_vbat_meas_and_ibat_print, + inode->i_private); +} + +static const struct file_operations ab8540_gpadc_vbat_meas_and_ibat_fops = { + .open = ab8540_gpadc_vbat_meas_and_ibat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8540_gpadc_vbat_true_meas_and_ibat_print(struct seq_file *s, void *p) +{ + int vbat_true_meas_raw; + int vbat_true_meas_convert; + int ibat_raw; + int ibat_convert; + struct ab8500_gpadc *gpadc; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + vbat_true_meas_raw = ab8500_gpadc_double_read_raw(gpadc, + VBAT_TRUE_MEAS_AND_IBAT, avg_sample, trig_edge, + trig_timer, conv_type, &ibat_raw); + vbat_true_meas_convert = ab8500_gpadc_ad_to_voltage(gpadc, + VBAT_TRUE_MEAS, vbat_true_meas_raw); + ibat_convert = ab8500_gpadc_ad_to_voltage(gpadc, IBAT_VIRTUAL_CHANNEL, + ibat_raw); + + return seq_printf(s, "%d,0x%X\n" "%d,0x%X\n", + vbat_true_meas_convert, vbat_true_meas_raw, + ibat_convert, ibat_raw); +} + +static int ab8540_gpadc_vbat_true_meas_and_ibat_open(struct inode *inode, + struct file *file) +{ + return single_open(file, ab8540_gpadc_vbat_true_meas_and_ibat_print, + inode->i_private); +} + +static const struct file_operations ab8540_gpadc_vbat_true_meas_and_ibat_fops = { + .open = ab8540_gpadc_vbat_true_meas_and_ibat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = 
single_release, + .owner = THIS_MODULE, +}; + +static int ab8540_gpadc_bat_temp_and_ibat_print(struct seq_file *s, void *p) +{ + int bat_temp_raw; + int bat_temp_convert; + int ibat_raw; + int ibat_convert; + struct ab8500_gpadc *gpadc; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + bat_temp_raw = ab8500_gpadc_double_read_raw(gpadc, BAT_TEMP_AND_IBAT, + avg_sample, trig_edge, trig_timer, conv_type, &ibat_raw); + bat_temp_convert = ab8500_gpadc_ad_to_voltage(gpadc, BTEMP_BALL, + bat_temp_raw); + ibat_convert = ab8500_gpadc_ad_to_voltage(gpadc, IBAT_VIRTUAL_CHANNEL, + ibat_raw); + + return seq_printf(s, "%d,0x%X\n" "%d,0x%X\n", + bat_temp_convert, bat_temp_raw, + ibat_convert, ibat_raw); +} + +static int ab8540_gpadc_bat_temp_and_ibat_open(struct inode *inode, + struct file *file) +{ + return single_open(file, ab8540_gpadc_bat_temp_and_ibat_print, + inode->i_private); +} + +static const struct file_operations ab8540_gpadc_bat_temp_and_ibat_fops = { + .open = ab8540_gpadc_bat_temp_and_ibat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + +static int ab8540_gpadc_otp_cal_print(struct seq_file *s, void *p) +{ + struct ab8500_gpadc *gpadc; + u16 vmain_l, vmain_h, btemp_l, btemp_h; + u16 vbat_l, vbat_h, ibat_l, ibat_h; + + gpadc = ab8500_gpadc_get("ab8500-gpadc.0"); + ab8540_gpadc_get_otp(gpadc, &vmain_l, &vmain_h, &btemp_l, &btemp_h, + &vbat_l, &vbat_h, &ibat_l, &ibat_h); + return seq_printf(s, "VMAIN_L:0x%X\n" + "VMAIN_H:0x%X\n" + "BTEMP_L:0x%X\n" + "BTEMP_H:0x%X\n" + "VBAT_L:0x%X\n" + "VBAT_H:0x%X\n" + "IBAT_L:0x%X\n" + "IBAT_H:0x%X\n" + , + vmain_l, + vmain_h, + btemp_l, + btemp_h, + vbat_l, + vbat_h, + ibat_l, + ibat_h); +} + +static int ab8540_gpadc_otp_cal_open(struct inode *inode, struct file *file) +{ + return single_open(file, ab8540_gpadc_otp_cal_print, inode->i_private); +} + +static const struct file_operations ab8540_gpadc_otp_calib_fops = { + .open = ab8540_gpadc_otp_cal_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .owner = THIS_MODULE, +}; + static int ab8500_gpadc_avg_sample_print(struct seq_file *s, void *p) { return seq_printf(s, "%d\n", avg_sample); @@ -2386,7 +2634,43 @@ static int ab8500_debug_probe(struct platform_device *plf) ab8500_gpadc_dir, &plf->dev, &ab8500_gpadc_die_temp_fops); if (!file) goto err; - + if (is_ab8540(ab8500)) { + file = debugfs_create_file("xtal_temp", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, &ab8540_gpadc_xtal_temp_fops); + if (!file) + goto err; + file = debugfs_create_file("vbattruemeas", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, + &ab8540_gpadc_vbat_true_meas_fops); + if (!file) + goto err; + file = debugfs_create_file("batctrl_and_ibat", + (S_IRUGO | S_IWUGO), ab8500_gpadc_dir, + &plf->dev, &ab8540_gpadc_bat_ctrl_and_ibat_fops); + if (!file) + goto err; + file = debugfs_create_file("vbatmeas_and_ibat", + (S_IRUGO | S_IWUGO), ab8500_gpadc_dir, + &plf->dev, + &ab8540_gpadc_vbat_meas_and_ibat_fops); + if (!file) + goto err; + file = debugfs_create_file("vbattruemeas_and_ibat", + (S_IRUGO | S_IWUGO), ab8500_gpadc_dir, + &plf->dev, + &ab8540_gpadc_vbat_true_meas_and_ibat_fops); + if (!file) + goto err; + file = debugfs_create_file("battemp_and_ibat", + (S_IRUGO | S_IWUGO), ab8500_gpadc_dir, + &plf->dev, &ab8540_gpadc_bat_temp_and_ibat_fops); + if (!file) + goto err; + file = debugfs_create_file("otp_calib", (S_IRUGO | S_IWUGO), + ab8500_gpadc_dir, &plf->dev, &ab8540_gpadc_otp_calib_fops); + if (!file) + goto err; + } file = 
debugfs_create_file("avg_sample", (S_IRUGO | S_IWUGO), ab8500_gpadc_dir, &plf->dev, &ab8500_gpadc_avg_sample_fops); if (!file) diff --git a/drivers/mfd/ab8500-gpadc.c b/drivers/mfd/ab8500-gpadc.c index c985b90577f6..e3535c74d5fe 100644 --- a/drivers/mfd/ab8500-gpadc.c +++ b/drivers/mfd/ab8500-gpadc.c @@ -135,6 +135,8 @@ enum cal_channels { struct adc_cal_data { s64 gain; s64 offset; + u16 otp_calib_hi; + u16 otp_calib_lo; }; /** @@ -829,6 +831,12 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) vmain_high = (((gpadc_cal[1] & 0xFF) << 2) | ((gpadc_cal[2] & 0xC0) >> 6)); vmain_low = ((gpadc_cal[2] & 0x3E) >> 1); + + gpadc->cal_data[ADC_INPUT_VMAIN].otp_calib_hi = + (u16)vmain_high; + gpadc->cal_data[ADC_INPUT_VMAIN].otp_calib_lo = + (u16)vmain_low; + gpadc->cal_data[ADC_INPUT_VMAIN].gain = CALIB_SCALE * (19500 - 315) / (vmain_high - vmain_low); gpadc->cal_data[ADC_INPUT_VMAIN].offset = CALIB_SCALE * @@ -856,6 +864,11 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) ibat_low = (((gpadc_otp4[1] & 0x01) << 5) | ((gpadc_otp4[2] & 0xF8) >> 3)); + gpadc->cal_data[ADC_INPUT_IBAT].otp_calib_hi = + (u16)ibat_high; + gpadc->cal_data[ADC_INPUT_IBAT].otp_calib_lo = + (u16)ibat_low; + V_gain = ((IBAT_VDROP_H - IBAT_VDROP_L) << CALIB_SHIFT_IBAT) / (ibat_high - ibat_low); @@ -892,6 +905,11 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) ((gpadc_cal[2] & 0xC0) >> 6)); vmain_low = ((gpadc_cal[2] & 0x3E) >> 1); + gpadc->cal_data[ADC_INPUT_VMAIN].otp_calib_hi = + (u16)vmain_high; + gpadc->cal_data[ADC_INPUT_VMAIN].otp_calib_lo = + (u16)vmain_low; + gpadc->cal_data[ADC_INPUT_VMAIN].gain = CALIB_SCALE * (19500 - 315) / (vmain_high - vmain_low); @@ -902,12 +920,16 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) gpadc->cal_data[ADC_INPUT_VMAIN].gain = 0; } } + /* Calculate gain and offset for BTEMP if all reads succeeded */ if (!(ret[2] < 0 || ret[3] < 0 || ret[4] < 0)) { btemp_high = (((gpadc_cal[2] & 0x01) << 9) | (gpadc_cal[3] << 1) | ((gpadc_cal[4] & 0x80) >> 7)); btemp_low = ((gpadc_cal[4] & 0x7C) >> 2); + gpadc->cal_data[ADC_INPUT_BTEMP].otp_calib_hi = (u16)btemp_high; + gpadc->cal_data[ADC_INPUT_BTEMP].otp_calib_lo = (u16)btemp_low; + gpadc->cal_data[ADC_INPUT_BTEMP].gain = CALIB_SCALE * (1300 - 21) / (btemp_high - btemp_low); gpadc->cal_data[ADC_INPUT_BTEMP].offset = CALIB_SCALE * 1300 - @@ -922,6 +944,9 @@ static void ab8500_gpadc_read_calibration_data(struct ab8500_gpadc *gpadc) vbat_high = (((gpadc_cal[4] & 0x03) << 8) | gpadc_cal[5]); vbat_low = ((gpadc_cal[6] & 0xFC) >> 2); + gpadc->cal_data[ADC_INPUT_VBAT].otp_calib_hi = (u16)vbat_high; + gpadc->cal_data[ADC_INPUT_VBAT].otp_calib_lo = (u16)vbat_low; + gpadc->cal_data[ADC_INPUT_VBAT].gain = CALIB_SCALE * (4700 - 2380) / (vbat_high - vbat_low); gpadc->cal_data[ADC_INPUT_VBAT].offset = CALIB_SCALE * 4700 - @@ -1131,6 +1156,25 @@ static void __exit ab8500_gpadc_exit(void) platform_driver_unregister(&ab8500_gpadc_driver); } +/** + * ab8540_gpadc_get_otp() - returns OTP values + * + */ +void ab8540_gpadc_get_otp(struct ab8500_gpadc *gpadc, + u16 *vmain_l, u16 *vmain_h, u16 *btemp_l, u16 *btemp_h, + u16 *vbat_l, u16 *vbat_h, u16 *ibat_l, u16 *ibat_h) +{ + *vmain_l = gpadc->cal_data[ADC_INPUT_VMAIN].otp_calib_lo; + *vmain_h = gpadc->cal_data[ADC_INPUT_VMAIN].otp_calib_hi; + *btemp_l = gpadc->cal_data[ADC_INPUT_BTEMP].otp_calib_lo; + *btemp_h = gpadc->cal_data[ADC_INPUT_BTEMP].otp_calib_hi; + *vbat_l = 
gpadc->cal_data[ADC_INPUT_VBAT].otp_calib_lo; + *vbat_h = gpadc->cal_data[ADC_INPUT_VBAT].otp_calib_hi; + *ibat_l = gpadc->cal_data[ADC_INPUT_IBAT].otp_calib_lo; + *ibat_h = gpadc->cal_data[ADC_INPUT_IBAT].otp_calib_hi; + return ; +} + subsys_initcall_sync(ab8500_gpadc_init); module_exit(ab8500_gpadc_exit); diff --git a/include/linux/mfd/abx500/ab8500-gpadc.h b/include/linux/mfd/abx500/ab8500-gpadc.h index 4131437ace4b..49ded001049b 100644 --- a/include/linux/mfd/abx500/ab8500-gpadc.h +++ b/include/linux/mfd/abx500/ab8500-gpadc.h @@ -68,5 +68,8 @@ int ab8500_gpadc_double_read_raw(struct ab8500_gpadc *gpadc, u8 channel, int *ibat); int ab8500_gpadc_ad_to_voltage(struct ab8500_gpadc *gpadc, u8 channel, int ad_value); +void ab8540_gpadc_get_otp(struct ab8500_gpadc *gpadc, + u16 *vmain_l, u16 *vmain_h, u16 *btemp_l, u16 *btemp_h, + u16 *vbat_l, u16 *vbat_h, u16 *ibat_l, u16 *ibat_h); #endif /* _AB8500_GPADC_H */ -- cgit From 93ff722e88530b9719cbf53be4f3197722461394 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 31 May 2012 16:16:36 +0200 Subject: ab8500-fg: Add power cut feature for ab8505 and ab8540 Add support for a power cut feature which allows user to configure when ab8505 and ab8540 based platforms should shut down system due to low battery. Signed-off-by: Lee Jones --- drivers/mfd/ab8500-core.c | 36 +++ drivers/power/ab8500_bmdata.c | 5 + drivers/power/ab8500_fg.c | 474 +++++++++++++++++++++++++++++++++++ include/linux/mfd/abx500.h | 10 + include/linux/mfd/abx500/ab8500-bm.h | 18 ++ 5 files changed, 543 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/ab8500-core.c b/drivers/mfd/ab8500-core.c index c7ff55753a8f..f276352cc9ef 100644 --- a/drivers/mfd/ab8500-core.c +++ b/drivers/mfd/ab8500-core.c @@ -113,6 +113,7 @@ #define AB8500_SWITCH_OFF_STATUS 0x00 #define AB8500_TURN_ON_STATUS 0x00 +#define AB8505_TURN_ON_STATUS_2 0x04 #define AB8500_CH_USBCH_STAT1_REG 0x02 #define VBUS_DET_DBNC100 0x02 @@ -1401,6 +1402,21 @@ static ssize_t show_turn_on_status(struct device *dev, return sprintf(buf, "%#x\n", value); } +static ssize_t show_turn_on_status_2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + int ret; + u8 value; + struct ab8500 *ab8500; + + ab8500 = dev_get_drvdata(dev); + ret = get_register_interruptible(ab8500, AB8500_SYS_CTRL1_BLOCK, + AB8505_TURN_ON_STATUS_2, &value); + if (ret < 0) + return ret; + return sprintf(buf, "%#x\n", (value & 0x1)); +} + static ssize_t show_ab9540_dbbrstn(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1457,6 +1473,7 @@ exit: static DEVICE_ATTR(chip_id, S_IRUGO, show_chip_id, NULL); static DEVICE_ATTR(switch_off_status, S_IRUGO, show_switch_off_status, NULL); static DEVICE_ATTR(turn_on_status, S_IRUGO, show_turn_on_status, NULL); +static DEVICE_ATTR(turn_on_status_2, S_IRUGO, show_turn_on_status_2, NULL); static DEVICE_ATTR(dbbrstn, S_IRUGO | S_IWUSR, show_ab9540_dbbrstn, store_ab9540_dbbrstn); @@ -1467,6 +1484,11 @@ static struct attribute *ab8500_sysfs_entries[] = { NULL, }; +static struct attribute *ab8505_sysfs_entries[] = { + &dev_attr_turn_on_status_2.attr, + NULL, +}; + static struct attribute *ab9540_sysfs_entries[] = { &dev_attr_chip_id.attr, &dev_attr_switch_off_status.attr, @@ -1479,6 +1501,10 @@ static struct attribute_group ab8500_attr_group = { .attrs = ab8500_sysfs_entries, }; +static struct attribute_group ab8505_attr_group = { + .attrs = ab8505_sysfs_entries, +}; + static struct attribute_group ab9540_attr_group = { .attrs = ab9540_sysfs_entries, }; @@ -1719,6 +1745,12 @@ 
static int ab8500_probe(struct platform_device *pdev) else ret = sysfs_create_group(&ab8500->dev->kobj, &ab8500_attr_group); + + if ((is_ab8505(ab8500) || is_ab9540(ab8500)) && + ab8500->chip_id >= AB8500_CUT2P0) + ret = sysfs_create_group(&ab8500->dev->kobj, + &ab8505_attr_group); + if (ret) dev_err(ab8500->dev, "error creating sysfs entries\n"); @@ -1735,6 +1767,10 @@ static int ab8500_remove(struct platform_device *pdev) else sysfs_remove_group(&ab8500->dev->kobj, &ab8500_attr_group); + if ((is_ab8505(ab8500) || is_ab9540(ab8500)) && + ab8500->chip_id >= AB8500_CUT2P0) + sysfs_remove_group(&ab8500->dev->kobj, &ab8505_attr_group); + mfd_remove_devices(ab8500->dev); return 0; diff --git a/drivers/power/ab8500_bmdata.c b/drivers/power/ab8500_bmdata.c index 7a96c0650fbb..e8759763fbe0 100644 --- a/drivers/power/ab8500_bmdata.c +++ b/drivers/power/ab8500_bmdata.c @@ -407,6 +407,11 @@ static const struct abx500_fg_parameters fg = { .battok_raising_th_sel1 = 2860, .maint_thres = 95, .user_cap_limit = 15, + .pcut_enable = 1, + .pcut_max_time = 127, + .pcut_flag_time = 112, + .pcut_max_restart = 15, + .pcut_debounce_time = 2, }; static const struct abx500_maxim_parameters maxi_params = { diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c index 25dae4c4b0ef..92f342bcf188 100644 --- a/drivers/power/ab8500_fg.c +++ b/drivers/power/ab8500_fg.c @@ -2344,6 +2344,50 @@ static int ab8500_fg_init_hw_registers(struct ab8500_fg *di) dev_err(di->dev, "BattOk init write failed.\n"); goto out; } + + if (((is_ab8505(di->parent) || is_ab9540(di->parent)) && + abx500_get_chip_id(di->dev) >= AB8500_CUT2P0) + || is_ab8540(di->parent)) { + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_MAX_TIME_REG, di->bm->fg_params->pcut_max_time); + + if (ret) { + dev_err(di->dev, "%s write failed AB8505_RTC_PCUT_MAX_TIME_REG\n", __func__); + goto out; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_FLAG_TIME_REG, di->bm->fg_params->pcut_flag_time); + + if (ret) { + dev_err(di->dev, "%s write failed AB8505_RTC_PCUT_FLAG_TIME_REG\n", __func__); + goto out; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_RESTART_REG, di->bm->fg_params->pcut_max_restart); + + if (ret) { + dev_err(di->dev, "%s write failed AB8505_RTC_PCUT_RESTART_REG\n", __func__); + goto out; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_DEBOUNCE_REG, di->bm->fg_params->pcut_debounce_time); + + if (ret) { + dev_err(di->dev, "%s write failed AB8505_RTC_PCUT_DEBOUNCE_REG\n", __func__); + goto out; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_CTL_STATUS_REG, di->bm->fg_params->pcut_enable); + + if (ret) { + dev_err(di->dev, "%s write failed AB8505_RTC_PCUT_CTL_STATUS_REG\n", __func__); + goto out; + } + } out: return ret; } @@ -2546,6 +2590,428 @@ static int ab8500_fg_sysfs_init(struct ab8500_fg *di) return ret; } + +static ssize_t ab8505_powercut_flagtime_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_FLAG_TIME_REG, &reg_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_FLAG_TIME_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0x7F)); + +fail: + return
ret; +} + +static ssize_t ab8505_powercut_flagtime_write(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + unsigned long reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + reg_value = simple_strtoul(buf, NULL, 10); + + if (reg_value > 0x7F) { + dev_err(dev, "Incorrect parameter, echo 0 (1.98s) - 127 (15.625ms) for flagtime\n"); + goto fail; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_FLAG_TIME_REG, (u8)reg_value); + + if (ret < 0) + dev_err(dev, "Failed to set AB8505_RTC_PCUT_FLAG_TIME_REG\n"); + +fail: + return count; +} + +static ssize_t ab8505_powercut_maxtime_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_MAX_TIME_REG, &reg_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_MAX_TIME_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0x7F)); + +fail: + return ret; + +} + +static ssize_t ab8505_powercut_maxtime_write(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + int reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + reg_value = simple_strtoul(buf, NULL, 10); + if (reg_value > 0x7F) { + dev_err(dev, "Incorrect parameter, echo 0 (0.0s) - 127 (1.98s) for maxtime\n"); + goto fail; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_MAX_TIME_REG, (u8)reg_value); + + if (ret < 0) + dev_err(dev, "Failed to set AB8505_RTC_PCUT_MAX_TIME_REG\n"); + +fail: + return count; +} + +static ssize_t ab8505_powercut_restart_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_RESTART_REG, &reg_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_RESTART_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0xF)); + +fail: + return ret; +} + +static ssize_t ab8505_powercut_restart_write(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + int reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + reg_value = simple_strtoul(buf, NULL, 10); + if (reg_value > 0xF) { + dev_err(dev, "Incorrect parameter, echo 0 - 15 for number of restart\n"); + goto fail; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_RESTART_REG, (u8)reg_value); + + if (ret < 0) + dev_err(dev, "Failed to set AB8505_RTC_PCUT_RESTART_REG\n"); + +fail: + return count; + +} + +static ssize_t ab8505_powercut_timer_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_TIME_REG, &reg_value); + + if (ret < 0) { + dev_err(dev,
"Failed to read AB8505_RTC_PCUT_TIME_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0x7F)); + +fail: + return ret; +} + +static ssize_t ab8505_powercut_restart_counter_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_RESTART_REG, ®_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_RESTART_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0xF0) >> 4); + +fail: + return ret; +} + +static ssize_t ab8505_powercut_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_CTL_STATUS_REG, ®_value); + + if (ret < 0) + goto fail; + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0x1)); + +fail: + return ret; +} + +static ssize_t ab8505_powercut_write(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + int reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + reg_value = simple_strtoul(buf, NULL, 10); + if (reg_value > 0x1) { + dev_err(dev, "Incorrect parameter, echo 0/1 to disable/enable Pcut feature\n"); + goto fail; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_CTL_STATUS_REG, (u8)reg_value); + + if (ret < 0) + dev_err(dev, "Failed to set AB8505_RTC_PCUT_CTL_STATUS_REG\n"); + +fail: + return count; +} + +static ssize_t ab8505_powercut_flag_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_CTL_STATUS_REG, ®_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_CTL_STATUS_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", ((reg_value & 0x10) >> 4)); + +fail: + return ret; +} + +static ssize_t ab8505_powercut_debounce_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_DEBOUNCE_REG, ®_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_DEBOUNCE_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", (reg_value & 0x7)); + +fail: + return ret; +} + +static ssize_t ab8505_powercut_debounce_write(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + int reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + reg_value = simple_strtoul(buf, NULL, 10); + if (reg_value > 0x7) { + dev_err(dev, "Incorrect parameter, echo 0 to 7 for debounce setting\n"); + goto fail; + } + + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_DEBOUNCE_REG, 
(u8)reg_value); + + if (ret < 0) + dev_err(dev, "Failed to set AB8505_RTC_PCUT_DEBOUNCE_REG\n"); + +fail: + return count; +} + +static ssize_t ab8505_powercut_enable_status_read(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int ret; + u8 reg_value; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + ret = abx500_get_register_interruptible(di->dev, AB8500_RTC, + AB8505_RTC_PCUT_CTL_STATUS_REG, ®_value); + + if (ret < 0) { + dev_err(dev, "Failed to read AB8505_RTC_PCUT_CTL_STATUS_REG\n"); + goto fail; + } + + return scnprintf(buf, PAGE_SIZE, "%d\n", ((reg_value & 0x20) >> 5)); + +fail: + return ret; +} + +static struct device_attribute ab8505_fg_sysfs_psy_attrs[] = { + __ATTR(powercut_flagtime, (S_IRUGO | S_IWUSR | S_IWGRP), + ab8505_powercut_flagtime_read, ab8505_powercut_flagtime_write), + __ATTR(powercut_maxtime, (S_IRUGO | S_IWUSR | S_IWGRP), + ab8505_powercut_maxtime_read, ab8505_powercut_maxtime_write), + __ATTR(powercut_restart_max, (S_IRUGO | S_IWUSR | S_IWGRP), + ab8505_powercut_restart_read, ab8505_powercut_restart_write), + __ATTR(powercut_timer, S_IRUGO, ab8505_powercut_timer_read, NULL), + __ATTR(powercut_restart_counter, S_IRUGO, + ab8505_powercut_restart_counter_read, NULL), + __ATTR(powercut_enable, (S_IRUGO | S_IWUSR | S_IWGRP), + ab8505_powercut_read, ab8505_powercut_write), + __ATTR(powercut_flag, S_IRUGO, ab8505_powercut_flag_read, NULL), + __ATTR(powercut_debounce_time, (S_IRUGO | S_IWUSR | S_IWGRP), + ab8505_powercut_debounce_read, ab8505_powercut_debounce_write), + __ATTR(powercut_enable_status, S_IRUGO, + ab8505_powercut_enable_status_read, NULL), +}; + +static int ab8500_fg_sysfs_psy_create_attrs(struct device *dev) +{ + unsigned int i, j; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + if (((is_ab8505(di->parent) || is_ab9540(di->parent)) && + abx500_get_chip_id(dev->parent) >= AB8500_CUT2P0) + || is_ab8540(di->parent)) { + for (j = 0; j < ARRAY_SIZE(ab8505_fg_sysfs_psy_attrs); j++) + if (device_create_file(dev, &ab8505_fg_sysfs_psy_attrs[j])) + goto sysfs_psy_create_attrs_failed_ab8505; + } + return 0; +sysfs_psy_create_attrs_failed_ab8505: + dev_err(dev, "Failed creating sysfs psy attrs for ab8505.\n"); + while (j--) + device_remove_file(dev, &ab8505_fg_sysfs_psy_attrs[i]); + + return -EIO; +} + +static void ab8500_fg_sysfs_psy_remove_attrs(struct device *dev) +{ + unsigned int i; + struct power_supply *psy = dev_get_drvdata(dev); + struct ab8500_fg *di; + + di = to_ab8500_fg_device_info(psy); + + if (((is_ab8505(di->parent) || is_ab9540(di->parent)) && + abx500_get_chip_id(dev->parent) >= AB8500_CUT2P0) + || is_ab8540(di->parent)) { + for (i = 0; i < ARRAY_SIZE(ab8505_fg_sysfs_psy_attrs); i++) + (void)device_remove_file(dev, &ab8505_fg_sysfs_psy_attrs[i]); + } +} + /* Exposure to the sysfs interface <> */ #if defined(CONFIG_PM) @@ -2607,6 +3073,7 @@ static int ab8500_fg_remove(struct platform_device *pdev) ab8500_fg_sysfs_exit(di); flush_scheduled_work(); + ab8500_fg_sysfs_psy_remove_attrs(di->fg_psy.dev); power_supply_unregister(&di->fg_psy); platform_set_drvdata(pdev, NULL); return ret; @@ -2772,6 +3239,13 @@ static int ab8500_fg_probe(struct platform_device *pdev) goto free_irq; } + ret = ab8500_fg_sysfs_psy_create_attrs(di->fg_psy.dev); + if (ret) { + dev_err(di->dev, "failed to create FG psy\n"); + ab8500_fg_sysfs_exit(di); + goto free_irq; + } + /* Calibrate the fg first time */ 
di->flags.calibrate = true; di->calib_state = AB8500_FG_CALIB_INIT; diff --git a/include/linux/mfd/abx500.h b/include/linux/mfd/abx500.h index 9ead60bc66b7..188aedc322c2 100644 --- a/include/linux/mfd/abx500.h +++ b/include/linux/mfd/abx500.h @@ -89,6 +89,11 @@ struct abx500_fg; * points. * @maint_thres This is the threshold where we stop reporting * battery full while in maintenance, in per cent + * @pcut_enable: Enable power cut feature in ab8505 + * @pcut_max_time: Max time threshold + * @pcut_flag_time: Flagtime threshold + * @pcut_max_restart: Max number of restarts + * @pcut_debounce_time: Sets battery debounce time */ struct abx500_fg_parameters { int recovery_sleep_timer; @@ -106,6 +111,11 @@ struct abx500_fg_parameters { int battok_raising_th_sel1; int user_cap_limit; int maint_thres; + bool pcut_enable; + u8 pcut_max_time; + u8 pcut_flag_time; + u8 pcut_max_restart; + u8 pcut_debounce_time; }; /** diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index 8d35bfe164c8..0efbe0efee7f 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -235,6 +235,14 @@ /* Battery type */ #define BATTERY_UNKNOWN 00 +/* Registers for pcut feature in ab8505 and ab9540 */ +#define AB8505_RTC_PCUT_CTL_STATUS_REG 0x12 +#define AB8505_RTC_PCUT_TIME_REG 0x13 +#define AB8505_RTC_PCUT_MAX_TIME_REG 0x14 +#define AB8505_RTC_PCUT_FLAG_TIME_REG 0x15 +#define AB8505_RTC_PCUT_RESTART_REG 0x16 +#define AB8505_RTC_PCUT_DEBOUNCE_REG 0x17 + /** * struct res_to_temp - defines one point in a temp to res curve. To * be used in battery packs that combines the identification resistor with a @@ -283,6 +291,11 @@ struct ab8500_fg; * points. * @maint_thres This is the threshold where we stop reporting * battery full while in maintenance, in per cent + * @pcut_enable: Enable power cut feature in ab8505 + * @pcut_max_time: Max time threshold + * @pcut_flag_time: Flagtime threshold + * @pcut_max_restart: Max number of restarts + * @pcut_debunce_time: Sets battery debounce time */ struct ab8500_fg_parameters { int recovery_sleep_timer; @@ -299,6 +312,11 @@ struct ab8500_fg_parameters { int battok_raising_th_sel1; int user_cap_limit; int maint_thres; + bool pcut_enable; + u8 pcut_max_time; + u8 pcut_flag_time; + u8 pcut_max_restart; + u8 pcut_debunce_time; }; /** -- cgit From 0f4aa401853e07885707aedfc68c608051b0d6e4 Mon Sep 17 00:00:00 2001 From: Yang QU Date: Tue, 26 Jun 2012 19:25:52 +0800 Subject: ab8500-charger: Add backup battery charge voltages on the ab8540 Add 2.7v, 2.9v, 3.0v, 3.2v and 3.3v charging voltages for backup battery. Before that only 2.5v, 2.6v, 2.8v, 3.1v were available. 
Signed-off-by: Yang QU Signed-off-by: Lee Jones Reviewed-by: Maxime COQUELIN Reviewed-by: Marcus COOPER Tested-by: Xiao Mei ZHANG --- drivers/power/ab8500_charger.c | 19 +++++++++++++++++-- include/linux/mfd/abx500/ab8500-bm.h | 24 ++++++++++++++++++++---- 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c index 24b30b7ea5ca..fd3fa2beca2b 100644 --- a/drivers/power/ab8500_charger.c +++ b/drivers/power/ab8500_charger.c @@ -2836,6 +2836,7 @@ static int ab8500_charger_usb_get_property(struct power_supply *psy, static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) { int ret = 0; + u8 bup_vch_range = 0, vbup33_vrtcn = 0; /* Setup maximum charger current and voltage for ABB cut2.0 */ if (!is_ab8500_1p1_or_earlier(di->parent)) { @@ -2945,15 +2946,29 @@ static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) } /* Backup battery voltage and current */ + if (di->bm->bkup_bat_v > BUP_VCH_SEL_3P1V) + bup_vch_range = BUP_VCH_RANGE; + if (di->bm->bkup_bat_v == BUP_VCH_SEL_3P3V) + vbup33_vrtcn = VBUP33_VRTCN; + ret = abx500_set_register_interruptible(di->dev, AB8500_RTC, AB8500_RTC_BACKUP_CHG_REG, - di->bm->bkup_bat_v | - di->bm->bkup_bat_i); + (di->bm->bkup_bat_v & 0x3) | di->bm->bkup_bat_i); if (ret) { dev_err(di->dev, "failed to setup backup battery charging\n"); goto out; } + if (is_ab8540(di->parent)) { + ret = abx500_set_register_interruptible(di->dev, + AB8500_RTC, + AB8500_RTC_CTRL1_REG, + bup_vch_range | vbup33_vrtcn); + if (ret) { + dev_err(di->dev, "failed to setup backup battery charging\n"); + goto out; + } + } /* Enable backup battery charging */ abx500_mask_and_set_register_interruptible(di->dev, diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index 0efbe0efee7f..a73e05a0441b 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -105,6 +105,7 @@ #define AB8500_RTC_BACKUP_CHG_REG 0x0C #define AB8500_RTC_CC_CONF_REG 0x01 #define AB8500_RTC_CTRL_REG 0x0B +#define AB8500_RTC_CTRL1_REG 0x11 /* * OTP register offsets @@ -179,10 +180,25 @@ #define BUP_ICH_SEL_300UA 0x08 #define BUP_ICH_SEL_700UA 0x0C -#define BUP_VCH_SEL_2P5V 0x00 -#define BUP_VCH_SEL_2P6V 0x01 -#define BUP_VCH_SEL_2P8V 0x02 -#define BUP_VCH_SEL_3P1V 0x03 +enum bup_vch_sel { + BUP_VCH_SEL_2P5V, + BUP_VCH_SEL_2P6V, + BUP_VCH_SEL_2P8V, + BUP_VCH_SEL_3P1V, + /* + * Note that the following 5 values 2.7v, 2.9v, 3.0v, 3.2v, 3.3v + * are only available on ab8540. You can't choose these 5 + * voltages on ab8500/ab8505/ab9540. + */ + BUP_VCH_SEL_2P7V, + BUP_VCH_SEL_2P9V, + BUP_VCH_SEL_3P0V, + BUP_VCH_SEL_3P2V, + BUP_VCH_SEL_3P3V, +}; + +#define BUP_VCH_RANGE 0x02 +#define VBUP33_VRTCN 0x01 /* Battery OVV constants */ #define BATT_OVV_ENA 0x02 -- cgit From 4dcdf57773fd45b483fc7613b9e51b89a57d655c Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 14 Feb 2013 09:24:10 +0000 Subject: ab8500-bm: Quick re-attach charging behaviour Due to a bug in some AB8500 ASICs charger removal cannot always be detected if the removal and reinsertion is done too close in time. This patch detects the above described case and handles the situation so that charging will be kept turned on.
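[Editorial outline, condensed from ab8500_charger_usb_check_enable() in the diff below; error handling and the analogous AC variant are omitted. If software still sees a connected charger but the hardware enable bit has silently dropped, the drop counter is reset and charging is switched back on:]

	if (di->usb.charger_connected) {
		abx500_get_register_interruptible(di->dev, AB8500_CHARGER,
						  AB8500_USBCH_CTRL1_REG,
						  &usbch_ctrl1);
		if (!(usbch_ctrl1 & USB_CH_ENA)) {
			/* reset the input current drop counter ... */
			abx500_mask_and_set_register_interruptible(di->dev,
				AB8500_CHARGER, AB8500_CHARGER_CTRL,
				DROP_COUNT_RESET, DROP_COUNT_RESET);
			/* ... and re-enable the VBUS charger */
			ab8500_charger_usb_en(&di->usb_chg, true, vset, iset);
		}
	}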
Signed-off-by: Lee Jones --- drivers/power/ab8500_charger.c | 105 +++++++++++++++++++++++++++++- drivers/power/abx500_chargalg.c | 33 ++++++++++ include/linux/mfd/abx500/ux500_chargalg.h | 1 + 3 files changed, 137 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c index dcd3c6feca97..3eb23cf9ff47 100644 --- a/drivers/power/ab8500_charger.c +++ b/drivers/power/ab8500_charger.c @@ -52,6 +52,7 @@ #define VBUS_DET_DBNC100 0x02 #define VBUS_DET_DBNC1 0x01 #define OTP_ENABLE_WD 0x01 +#define DROP_COUNT_RESET 0x01 #define MAIN_CH_INPUT_CURR_SHIFT 4 #define VBUS_IN_CURR_LIM_SHIFT 4 @@ -1677,6 +1678,105 @@ static int ab8500_charger_usb_en(struct ux500_charger *charger, return ret; } +/** + * ab8500_charger_usb_check_enable() - enable usb charging + * @charger: pointer to the ux500_charger structure + * @vset: charging voltage + * @iset: charger output current + * + * Check if the VBUS charger has been disconnected and reconnected without + * AB8500 raising an interrupt. Returns 0 on success. + */ +static int ab8500_charger_usb_check_enable(struct ux500_charger *charger, + int vset, int iset) +{ + u8 usbch_ctrl1 = 0; + int ret = 0; + + struct ab8500_charger *di = to_ab8500_charger_usb_device_info(charger); + + if (!di->usb.charger_connected) + return ret; + + ret = abx500_get_register_interruptible(di->dev, AB8500_CHARGER, + AB8500_USBCH_CTRL1_REG, &usbch_ctrl1); + if (ret < 0) { + dev_err(di->dev, "ab8500 read failed %d\n", __LINE__); + return ret; + } + dev_dbg(di->dev, "USB charger ctrl: 0x%02x\n", usbch_ctrl1); + + if (!(usbch_ctrl1 & USB_CH_ENA)) { + dev_info(di->dev, "Charging has been disabled abnormally and will be re-enabled\n"); + + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8500_CHARGER_CTRL, + DROP_COUNT_RESET, DROP_COUNT_RESET); + if (ret < 0) { + dev_err(di->dev, "ab8500 write failed %d\n", __LINE__); + return ret; + } + + ret = ab8500_charger_usb_en(&di->usb_chg, true, vset, iset); + if (ret < 0) { + dev_err(di->dev, "Failed to enable VBUS charger %d\n", + __LINE__); + return ret; + } + } + return ret; +} + +/** + * ab8500_charger_ac_check_enable() - enable ac charging + * @charger: pointer to the ux500_charger structure + * @vset: charging voltage + * @iset: charger output current + * + * Check if the AC charger has been disconnected and reconnected without + * AB8500 raising an interrupt. Returns 0 on success.
+ */ +static int ab8500_charger_ac_check_enable(struct ux500_charger *charger, + int vset, int iset) +{ + u8 mainch_ctrl1 = 0; + int ret = 0; + + struct ab8500_charger *di = to_ab8500_charger_ac_device_info(charger); + + if (!di->ac.charger_connected) + return ret; + + ret = abx500_get_register_interruptible(di->dev, AB8500_CHARGER, + AB8500_MCH_CTRL1, &mainch_ctrl1); + if (ret < 0) { + dev_err(di->dev, "ab8500 read failed %d\n", __LINE__); + return ret; + } + dev_dbg(di->dev, "AC charger ctrl: 0x%02x\n", mainch_ctrl1); + + if (!(mainch_ctrl1 & MAIN_CH_ENA)) { + dev_info(di->dev, "Charging has been disabled abnormally and will be re-enabled\n"); + + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8500_CHARGER_CTRL, + DROP_COUNT_RESET, DROP_COUNT_RESET); + + if (ret < 0) { + dev_err(di->dev, "ab8500 write failed %d\n", __LINE__); + return ret; + } + + ret = ab8500_charger_ac_en(&di->ac_chg, true, vset, iset); + if (ret < 0) { + dev_err(di->dev, "failed to enable AC charger %d\n", + __LINE__); + return ret; + } + } + return ret; +} + /** * ab8500_charger_watchdog_kick() - kick charger watchdog * @di: pointer to the ab8500_charger structure @@ -1734,8 +1834,7 @@ static int ab8500_charger_update_charger_current(struct ux500_charger *charger, /* Reset the main and usb drop input current measurement counter */ ret = abx500_set_register_interruptible(di->dev, AB8500_CHARGER, - AB8500_CHARGER_CTRL, - 0x1); + AB8500_CHARGER_CTRL, DROP_COUNT_RESET); if (ret) { dev_err(di->dev, "%s write failed\n", __func__); return ret; } @@ -3221,6 +3320,7 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->ac_chg.psy.num_supplicants = ARRAY_SIZE(supply_interface), /* ux500_charger sub-class */ di->ac_chg.ops.enable = &ab8500_charger_ac_en; + di->ac_chg.ops.check_enable = &ab8500_charger_ac_check_enable; di->ac_chg.ops.kick_wd = &ab8500_charger_watchdog_kick; di->ac_chg.ops.update_curr = &ab8500_charger_update_charger_current; di->ac_chg.max_out_volt = ab8500_charger_voltage_map[ @@ -3242,6 +3342,7 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->usb_chg.psy.num_supplicants = ARRAY_SIZE(supply_interface), /* ux500_charger sub-class */ di->usb_chg.ops.enable = &ab8500_charger_usb_en; + di->usb_chg.ops.check_enable = &ab8500_charger_usb_check_enable; di->usb_chg.ops.kick_wd = &ab8500_charger_watchdog_kick; di->usb_chg.ops.update_curr = &ab8500_charger_update_charger_current; di->usb_chg.max_out_volt = ab8500_charger_voltage_map[ diff --git a/drivers/power/abx500_chargalg.c b/drivers/power/abx500_chargalg.c index 31507bfe549c..8ab65a3a8190 100644 --- a/drivers/power/abx500_chargalg.c +++ b/drivers/power/abx500_chargalg.c @@ -305,6 +305,30 @@ static void abx500_chargalg_state_to(struct abx500_chargalg *di, di->charge_state = state; } +static int abx500_chargalg_check_charger_enable(struct abx500_chargalg *di) +{ + switch (di->charge_state) { + case STATE_NORMAL: + case STATE_MAINTENANCE_A: + case STATE_MAINTENANCE_B: + break; + default: + return 0; + } + + if (di->chg_info.charger_type & USB_CHG) { + return di->usb_chg->ops.check_enable(di->usb_chg, + di->bm->bat_type[di->bm->batt_id].normal_vol_lvl, + di->bm->bat_type[di->bm->batt_id].normal_cur_lvl); + } else if ((di->chg_info.charger_type & AC_CHG) && + !(di->ac_chg->external)) { + return di->ac_chg->ops.check_enable(di->ac_chg, + di->bm->bat_type[di->bm->batt_id].normal_vol_lvl, + di->bm->bat_type[di->bm->batt_id].normal_cur_lvl); + } + return 0; +} + /** * abx500_chargalg_check_charger_connection()
- Check charger connection change * @di: pointer to the abx500_chargalg structure @@ -1219,6 +1243,7 @@ static void abx500_chargalg_external_power_changed(struct power_supply *psy) static void abx500_chargalg_algorithm(struct abx500_chargalg *di) { int charger_status; + int ret; /* Collect data from all power_supply class devices */ class_for_each_device(power_supply_class, NULL, @@ -1229,6 +1254,14 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di) abx500_chargalg_check_charger_voltage(di); charger_status = abx500_chargalg_check_charger_connection(di); + + if (is_ab8500(di->parent)) { + ret = abx500_chargalg_check_charger_enable(di); + if (ret < 0) + dev_err(di->dev, "Checking charger is enabled error" + ": Returned Value %d\n", ret); + } + /* * First check if we have a charger connected. * Also we don't allow charging of unknown batteries if configured diff --git a/include/linux/mfd/abx500/ux500_chargalg.h b/include/linux/mfd/abx500/ux500_chargalg.h index d43ac0f35526..110d12f09548 100644 --- a/include/linux/mfd/abx500/ux500_chargalg.h +++ b/include/linux/mfd/abx500/ux500_chargalg.h @@ -17,6 +17,7 @@ struct ux500_charger; struct ux500_charger_ops { int (*enable) (struct ux500_charger *, int, int, int); + int (*check_enable) (struct ux500_charger *, int, int); int (*kick_wd) (struct ux500_charger *); int (*update_curr) (struct ux500_charger *, int); }; -- cgit From 8891716e24d7b0f4b1c3b4fdff641bcb1fb282c4 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 13 Feb 2013 11:39:19 +0000 Subject: ab8500-bm: Charge only mode fixes for the ab9540 Fix for charging not getting enabled in charge only mode by external charger. Signed-off-by: Lee Jones --- drivers/power/ab8500_charger.c | 42 +++++++++++++++++++++++++++++++ drivers/power/abx500_chargalg.c | 14 +++++++++++ drivers/power/pm2301_charger.c | 7 ++++++ include/linux/mfd/abx500/ux500_chargalg.h | 2 ++ 4 files changed, 65 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c index 3eb23cf9ff47..f1d712308b02 100644 --- a/drivers/power/ab8500_charger.c +++ b/drivers/power/ab8500_charger.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -97,6 +98,10 @@ #define AB8500_SW_CONTROL_FALLBACK 0x03 /* Wait for enumeration before charging in us */ #define WAIT_ACA_RID_ENUMERATION (5 * 1000) +/* External charger control */ +#define AB8500_SYS_CHARGER_CONTROL_REG 0x52 +#define EXTERNAL_CHARGER_DISABLE_REG_VAL 0x03 +#define EXTERNAL_CHARGER_ENABLE_REG_VAL 0x07 /* UsbLineStatus register - usb types */ enum ab8500_charger_link_status { @@ -1678,6 +1683,29 @@ static int ab8500_charger_usb_en(struct ux500_charger *charger, return ret; } +static int ab8500_external_charger_prepare(struct notifier_block *charger_nb, + unsigned long event, void *data) +{ + int ret; + struct device *dev = data; + /* Toggle external charger control pin */ + ret = abx500_set_register_interruptible(dev, AB8500_SYS_CTRL1_BLOCK, + AB8500_SYS_CHARGER_CONTROL_REG, + EXTERNAL_CHARGER_DISABLE_REG_VAL); + if (ret < 0) { + dev_err(dev, "write reg failed %d\n", ret); + goto out; + } + ret = abx500_set_register_interruptible(dev, AB8500_SYS_CTRL1_BLOCK, + AB8500_SYS_CHARGER_CONTROL_REG, + EXTERNAL_CHARGER_ENABLE_REG_VAL); + if (ret < 0) + dev_err(dev, "Write reg failed %d\n", ret); + +out: + return ret; +} + /** * ab8500_charger_usb_check_enable() - enable usb charging * @charger: pointer to the ux500_charger structure @@ -3221,6 +3249,10 @@ static int
ab8500_charger_suspend(struct platform_device *pdev, #define ab8500_charger_resume NULL #endif +static struct notifier_block charger_nb = { + .notifier_call = ab8500_external_charger_prepare, +}; + static int ab8500_charger_remove(struct platform_device *pdev) { struct ab8500_charger *di = platform_get_drvdata(pdev); @@ -3250,6 +3282,11 @@ static int ab8500_charger_remove(struct platform_device *pdev) /* Delete the work queue */ destroy_workqueue(di->charger_wq); + /* Unregister external charger enable notifier */ + if (!di->ac_chg.enabled) + blocking_notifier_chain_unregister( + &charger_notifier_list, &charger_nb); + flush_scheduled_work(); if (di->usb_chg.enabled) power_supply_unregister(&di->usb_chg.psy); @@ -3331,6 +3368,11 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->ac_chg.enabled = di->bm->ac_enabled; di->ac_chg.external = false; + /*notifier for external charger enabling*/ + if (!di->ac_chg.enabled) + blocking_notifier_chain_register( + &charger_notifier_list, &charger_nb); + /* USB supply */ /* power_supply base class */ di->usb_chg.psy.name = "ab8500_usb"; diff --git a/drivers/power/abx500_chargalg.c b/drivers/power/abx500_chargalg.c index 8ab65a3a8190..a876976678ab 100644 --- a/drivers/power/abx500_chargalg.c +++ b/drivers/power/abx500_chargalg.c @@ -26,6 +26,7 @@ #include #include #include +#include /* Watchdog kick interval */ #define CHG_WD_INTERVAL (6 * HZ) @@ -243,6 +244,9 @@ struct abx500_chargalg { struct kobject chargalg_kobject; }; +/*External charger prepare notifier*/ +BLOCKING_NOTIFIER_HEAD(charger_notifier_list); + /* Main battery properties */ static enum power_supply_property abx500_chargalg_props[] = { POWER_SUPPLY_PROP_STATUS, @@ -503,6 +507,8 @@ static int abx500_chargalg_kick_watchdog(struct abx500_chargalg *di) static int abx500_chargalg_ac_en(struct abx500_chargalg *di, int enable, int vset, int iset) { + static int abx500_chargalg_ex_ac_enable_toggle; + if (!di->ac_chg || !di->ac_chg->ops.enable) return -ENXIO; @@ -515,6 +521,14 @@ static int abx500_chargalg_ac_en(struct abx500_chargalg *di, int enable, di->chg_info.ac_iset = iset; di->chg_info.ac_vset = vset; + /* Enable external charger */ + if (enable && di->ac_chg->external && + !abx500_chargalg_ex_ac_enable_toggle) { + blocking_notifier_call_chain(&charger_notifier_list, + 0, di->dev); + abx500_chargalg_ex_ac_enable_toggle++; + } + return di->ac_chg->ops.enable(di->ac_chg, enable, vset, iset); } diff --git a/drivers/power/pm2301_charger.c b/drivers/power/pm2301_charger.c index b560fa5ac4e7..45ef3b9de6b9 100644 --- a/drivers/power/pm2301_charger.c +++ b/drivers/power/pm2301_charger.c @@ -1059,6 +1059,13 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client, ret = pm2xxx_charger_detection(pm2, &val); if ((ret == 0) && val) { + /* + * When boot is due to AC charger plug-in, + * read interrupt registers + */ + pm2xxx_reg_read(pm2, PM2XXX_REG_INT1, &val); + pm2xxx_reg_read(pm2, PM2XXX_REG_INT2, &val); + pm2xxx_reg_read(pm2, PM2XXX_REG_INT4, &val); pm2->ac.charger_connected = 1; pm2->ac_conn = true; power_supply_changed(&pm2->ac_chg.psy); diff --git a/include/linux/mfd/abx500/ux500_chargalg.h b/include/linux/mfd/abx500/ux500_chargalg.h index 110d12f09548..fa831f1e8cf8 100644 --- a/include/linux/mfd/abx500/ux500_chargalg.h +++ b/include/linux/mfd/abx500/ux500_chargalg.h @@ -41,4 +41,6 @@ struct ux500_charger { bool external; }; +extern struct blocking_notifier_head charger_notifier_list; + #endif -- cgit From db43e6c473b57d4e7a55c4bd6edef71f40f13eae Mon Sep 17 00:00:00 
2001 From: Lee Jones Date: Thu, 14 Feb 2013 12:39:15 +0000 Subject: ab8500-bm: Add usb power path support AB8540 supports a power path function in USB charging mode for fast power-up with a dead or weak battery, and it can extend battery life. When USB charging starts, if Vbattrue is below the SW cut-off voltage, power path and pre-charge should be enabled. If Vbattrue is higher than the SW cut-off voltage, power path and pre-charge should be disabled; this makes sure the full current goes to charging the battery. At the end of charge, the power path should be enabled again to avoid charging the battery again. Signed-off-by: Lee Jones --- drivers/power/ab8500_charger.c | 81 +++++++++++++++++++++++++++++++ drivers/power/abx500_chargalg.c | 62 +++++++++++++++++++++++ include/linux/mfd/abx500.h | 1 + include/linux/mfd/abx500/ab8500-bm.h | 12 +++++ include/linux/mfd/abx500/ux500_chargalg.h | 4 ++ 5 files changed, 160 insertions(+) (limited to 'include/linux') diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c index 6fea4fdf8701..f249a65b02e1 100644 --- a/drivers/power/ab8500_charger.c +++ b/drivers/power/ab8500_charger.c @@ -1925,6 +1925,67 @@ static int ab8500_charger_update_charger_current(struct ux500_charger *charger, return ret; } +/** + * ab8540_charger_power_path_enable() - enable usb power path mode + * @charger: pointer to the ux500_charger structure + * @enable: enable/disable flag + * + * Enable or disable the power path for usb mode + * Returns error code in case of failure else 0 (on success) + */ +static int ab8540_charger_power_path_enable(struct ux500_charger *charger, + bool enable) +{ + int ret; + struct ab8500_charger *di; + + if (charger->psy.type == POWER_SUPPLY_TYPE_USB) + di = to_ab8500_charger_usb_device_info(charger); + else + return -ENXIO; + + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8540_USB_PP_MODE_REG, + BUS_POWER_PATH_MODE_ENA, enable); + if (ret) { + dev_err(di->dev, "%s write failed\n", __func__); + return ret; + } + + return ret; +} + +/** + * ab8540_charger_usb_pre_chg_enable() - enable usb pre-charge + * @charger: pointer to the ux500_charger structure + * @enable: enable/disable flag + * + * Enable or disable the pre-charge for usb mode + * Returns error code in case of failure else 0 (on success) + */ +static int ab8540_charger_usb_pre_chg_enable(struct ux500_charger *charger, + bool enable) +{ + int ret; + struct ab8500_charger *di; + + if (charger->psy.type == POWER_SUPPLY_TYPE_USB) + di = to_ab8500_charger_usb_device_info(charger); + else + return -ENXIO; + + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8540_USB_PP_CHR_REG, + BUS_POWER_PATH_PRECHG_ENA, enable); + if (ret) { + dev_err(di->dev, "%s write failed\n", __func__); + return ret; + } + + return ret; +} + static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data) { struct power_supply *psy; @@ -3201,6 +3262,23 @@ static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) if (ret < 0) dev_err(di->dev, "%s mask and set failed\n", __func__); + if (is_ab8540(di->parent)) { + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8540_USB_PP_MODE_REG, + BUS_VSYS_VOL_SELECT_MASK, BUS_VSYS_VOL_SELECT_3P6V); + if (ret) { + dev_err(di->dev, "failed to setup usb power path vsys voltage\n"); + goto out; + } + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8540_USB_PP_CHR_REG, + BUS_PP_PRECHG_CURRENT_MASK, 0); + if (ret) { + dev_err(di->dev, "failed to setup usb power path precharge current\n"); + goto out; + } + } + out: return ret; } @@ -3484,6 +3562,8 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->usb_chg.ops.check_enable = &ab8500_charger_usb_check_enable; di->usb_chg.ops.kick_wd = &ab8500_charger_watchdog_kick; di->usb_chg.ops.update_curr = &ab8500_charger_update_charger_current; + di->usb_chg.ops.pp_enable = &ab8540_charger_power_path_enable; + di->usb_chg.ops.pre_chg_enable = &ab8540_charger_usb_pre_chg_enable; di->usb_chg.max_out_volt = ab8500_charger_voltage_map[ ARRAY_SIZE(ab8500_charger_voltage_map) - 1]; di->usb_chg.max_out_curr = ab8500_charger_current_map[ @@ -3491,6 +3571,7 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->usb_chg.wdt_refresh = CHG_WD_INTERVAL; di->usb_chg.enabled = di->bm->usb_enabled; di->usb_chg.external = false; + di->usb_chg.power_path = di->bm->usb_power_path; di->usb_state.usb_current = -1; /* Create a work queue for the charger */ diff --git a/drivers/power/abx500_chargalg.c b/drivers/power/abx500_chargalg.c index a876976678ab..a9b8efdafb8f 100644 --- a/drivers/power/abx500_chargalg.c +++ b/drivers/power/abx500_chargalg.c @@ -34,6 +34,9 @@ /* End-of-charge criteria counter */ #define EOC_COND_CNT 10 +/* Plus margin for the low battery threshold */ +#define BAT_PLUS_MARGIN (100) + #define to_abx500_chargalg_device_info(x) container_of((x), \ struct abx500_chargalg, chargalg_psy); @@ -83,6 +86,7 @@ enum abx500_chargalg_states { STATE_HW_TEMP_PROTECT_INIT, STATE_HW_TEMP_PROTECT, STATE_NORMAL_INIT, + STATE_USB_PP_PRE_CHARGE, STATE_NORMAL, STATE_WAIT_FOR_RECHARGE_INIT, STATE_WAIT_FOR_RECHARGE, @@ -114,6 +118,7 @@ static const char *states[] = { "HW_TEMP_PROTECT_INIT", "HW_TEMP_PROTECT", "NORMAL_INIT", + "USB_PP_PRE_CHARGE", "NORMAL", "WAIT_FOR_RECHARGE_INIT", "WAIT_FOR_RECHARGE", @@ -560,6 +565,37 @@ static int abx500_chargalg_usb_en(struct abx500_chargalg *di, int enable, return di->usb_chg->ops.enable(di->usb_chg, enable, vset, iset); } +/** + * ab8540_chargalg_usb_pp_en() - Enable/disable USB power path + * @di: pointer to the abx500_chargalg structure + * @enable: power path enable/disable + * + * The USB power path will be enabled/disabled + */ +static int ab8540_chargalg_usb_pp_en(struct abx500_chargalg *di, bool enable) +{ + if (!di->usb_chg || !di->usb_chg->ops.pp_enable) + return -ENXIO; + + return di->usb_chg->ops.pp_enable(di->usb_chg, enable); +} + +/** + * ab8540_chargalg_usb_pre_chg_en() - Enable/disable USB pre-charge + * @di: pointer to the abx500_chargalg structure + * @enable: USB pre-charge enable/disable + * + * The USB pre-charge will be enabled/disabled + */ +static int ab8540_chargalg_usb_pre_chg_en(struct abx500_chargalg *di, + bool enable) +{ + if (!di->usb_chg || !di->usb_chg->ops.pre_chg_enable) + return -ENXIO; + + return di->usb_chg->ops.pre_chg_enable(di->usb_chg, enable); +} + /** * abx500_chargalg_update_chg_curr() - Update charger current * @di: pointer to the abx500_chargalg structure @@ -765,6 +801,9 @@ static void abx500_chargalg_end_of_charge(struct abx500_chargalg *di) di->batt_data.avg_curr > 0) { if (++di->eoc_cnt >= EOC_COND_CNT) { di->eoc_cnt = 0; + if ((di->chg_info.charger_type & USB_CHG) && + (di->usb_chg->power_path)) + ab8540_chargalg_usb_pp_en(di, true); di->charge_status = POWER_SUPPLY_STATUS_FULL; di->maintenance_chg = true; dev_dbg(di->dev, "EOC reached!\n"); @@ -1465,6 +1504,22 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di) break; case STATE_NORMAL_INIT: + if
((di->chg_info.charger_type & USB_CHG) && + di->usb_chg->power_path) { + if (di->batt_data.volt > + (di->bm->fg_params->lowbat_threshold + + BAT_PLUS_MARGIN)) { + ab8540_chargalg_usb_pre_chg_en(di, false); + ab8540_chargalg_usb_pp_en(di, false); + } else { + ab8540_chargalg_usb_pp_en(di, true); + ab8540_chargalg_usb_pre_chg_en(di, true); + abx500_chargalg_state_to(di, + STATE_USB_PP_PRE_CHARGE); + break; + } + } + abx500_chargalg_start_charging(di, di->bm->bat_type[di->bm->batt_id].normal_vol_lvl, di->bm->bat_type[di->bm->batt_id].normal_cur_lvl); @@ -1479,6 +1534,13 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di) break; + case STATE_USB_PP_PRE_CHARGE: + if (di->batt_data.volt > + (di->bm->fg_params->lowbat_threshold + + BAT_PLUS_MARGIN)) + abx500_chargalg_state_to(di, STATE_NORMAL_INIT); + break; + case STATE_NORMAL: handle_maxim_chg_curr(di); if (di->charge_status == POWER_SUPPLY_STATUS_FULL && diff --git a/include/linux/mfd/abx500.h b/include/linux/mfd/abx500.h index 188aedc322c2..cd71d8eadf50 100644 --- a/include/linux/mfd/abx500.h +++ b/include/linux/mfd/abx500.h @@ -267,6 +267,7 @@ struct abx500_bm_data { bool autopower_cfg; bool ac_enabled; bool usb_enabled; + bool usb_power_path; bool no_maintenance; bool capacity_scaling; bool chg_unknown_bat; diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index a73e05a0441b..0ebf0c5d1f88 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -69,6 +69,8 @@ #define AB8500_USBCH_CTRL1_REG 0xC0 #define AB8500_USBCH_CTRL2_REG 0xC1 #define AB8500_USBCH_IPT_CRNTLVL_REG 0xC2 +#define AB8540_USB_PP_MODE_REG 0xC5 +#define AB8540_USB_PP_CHR_REG 0xC6 /* * Gas Gauge register offsets @@ -259,6 +261,16 @@ enum bup_vch_sel { #define AB8505_RTC_PCUT_RESTART_REG 0x16 #define AB8505_RTC_PCUT_DEBOUNCE_REG 0x17 +/* USB Power Path constants for ab8540 */ +#define BUS_VSYS_VOL_SELECT_MASK 0x06 +#define BUS_VSYS_VOL_SELECT_3P6V 0x00 +#define BUS_VSYS_VOL_SELECT_3P325V 0x02 +#define BUS_VSYS_VOL_SELECT_3P9V 0x04 +#define BUS_VSYS_VOL_SELECT_4P3V 0x06 +#define BUS_POWER_PATH_MODE_ENA 0x01 +#define BUS_PP_PRECHG_CURRENT_MASK 0x0E +#define BUS_POWER_PATH_PRECHG_ENA 0x01 + /** * struct res_to_temp - defines one point in a temp to res curve. 
To * be used in battery packs that combine the identification resistor with a diff --git a/include/linux/mfd/abx500/ux500_chargalg.h b/include/linux/mfd/abx500/ux500_chargalg.h index fa831f1e8cf8..234c99143bf7 100644 --- a/include/linux/mfd/abx500/ux500_chargalg.h +++ b/include/linux/mfd/abx500/ux500_chargalg.h @@ -20,6 +20,8 @@ struct ux500_charger_ops { int (*check_enable) (struct ux500_charger *, int, int); int (*kick_wd) (struct ux500_charger *); int (*update_curr) (struct ux500_charger *, int); + int (*pp_enable) (struct ux500_charger *, bool); + int (*pre_chg_enable) (struct ux500_charger *, bool); }; /** @@ -30,6 +32,7 @@ struct ux500_charger_ops { * @max_out_curr maximum output charger current in mA * @enabled indicates if this charger is used or not * @external external charger unit (pm2xxx) + * @power_path USB power path support */ struct ux500_charger { struct power_supply psy; @@ -39,6 +42,7 @@ struct ux500_charger { int wdt_refresh; bool enabled; bool external; + bool power_path; }; extern struct blocking_notifier_head charger_notifier_list; -- cgit From 861a30da53e2c5b9823b5390c1757baaf8f6e356 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 29 Aug 2012 20:36:51 +0800 Subject: ab8500-bm: Add support for the new ab8540 platform Provide AB8540 platform specific information required to run the Battery Management subsystem on AB8540 based devices. For this to happen we see the introduction of separate platform specific data structures and a means by which to process them. Signed-off-by: Lee Jones --- drivers/power/ab8500_bmdata.c | 91 +++++++++++- drivers/power/ab8500_btemp.c | 42 +++++- drivers/power/ab8500_charger.c | 270 +++++++++++++++-------------------- include/linux/mfd/abx500.h | 10 +- include/linux/mfd/abx500/ab8500-bm.h | 5 +- 5 files changed, 248 insertions(+), 170 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/ab8500_bmdata.c b/drivers/power/ab8500_bmdata.c index e8759763fbe0..85742a6d29ff 100644 --- a/drivers/power/ab8500_bmdata.c +++ b/drivers/power/ab8500_bmdata.c @@ -414,13 +414,20 @@ static const struct abx500_fg_parameters fg = { .pcut_debounce_time = 2, }; -static const struct abx500_maxim_parameters maxi_params = { +static const struct abx500_maxim_parameters ab8500_maxi_params = { .ena_maxi = true, .chg_curr = 910, .wait_cycles = 10, .charger_curr_step = 100, }; +static const struct abx500_maxim_parameters abx540_maxi_params = { + .ena_maxi = true, + .chg_curr = 3000, + .wait_cycles = 10, + .charger_curr_step = 200, +}; + static const struct abx500_bm_charger_parameters chg = { .usb_volt_max = 5500, .usb_curr_max = 1500, @@ -428,6 +435,46 @@ static const struct abx500_bm_charger_parameters chg = { .ac_curr_max = 1500, }; +/* + * This array maps the raw hex value to charger output current used by the + * AB8500 values + */ +static int ab8500_charge_output_curr_map[] = { + 100, 200, 300, 400, 500, 600, 700, 800, + 900, 1000, 1100, 1200, 1300, 1400, 1500, 1500, +}; + +static int ab8540_charge_output_curr_map[] = { + 0, 0, 0, 75, 100, 125, 150, 175, + 200, 225, 250, 275, 300, 325, 350, 375, + 400, 425, 450, 475, 500, 525, 550, 575, + 600, 625, 650, 675, 700, 725, 750, 775, + 800, 825, 850, 875, 900, 925, 950, 975, + 1000, 1025, 1050, 1075, 1100, 1125, 1150, 1175, + 1200, 1225, 1250, 1275, 1300, 1325, 1350, 1375, + 1400, 1425, 1450, 1500, 1600, 1700, 1900, 2000, +}; + +/* + * This array maps the raw hex value to charger input current used by the + * AB8500 values + */ +static int ab8500_charge_input_curr_map[] = { + 50, 98, 193, 290, 380,
450, 500, 600, + 700, 800, 900, 1000, 1100, 1300, 1400, 1500, +}; + +static int ab8540_charge_input_curr_map[] = { + 25, 50, 75, 100, 125, 150, 175, 200, + 225, 250, 275, 300, 325, 350, 375, 400, + 425, 450, 475, 500, 525, 550, 575, 600, + 625, 650, 675, 700, 725, 750, 775, 800, + 825, 850, 875, 900, 925, 950, 975, 1000, + 1025, 1050, 1075, 1100, 1125, 1150, 1175, 1200, + 1225, 1250, 1275, 1300, 1325, 1350, 1375, 1400, + 1425, 1450, 1475, 1500, 1500, 1500, 1500, 1500, +}; + struct abx500_bm_data ab8500_bm_data = { .temp_under = 3, .temp_low = 8, @@ -447,15 +494,53 @@ struct abx500_bm_data ab8500_bm_data = { .fg_res = 100, .cap_levels = &cap_levels, .bat_type = bat_type_thermistor, - .n_btypes = 3, + .n_btypes = ARRAY_SIZE(bat_type_thermistor), .batt_id = 0, .interval_charging = 5, .interval_not_charging = 120, .temp_hysteresis = 3, .gnd_lift_resistance = 34, - .maxi = &maxi_params, + .chg_output_curr = ab8500_charge_output_curr_map, + .n_chg_out_curr = ARRAY_SIZE(ab8500_charge_output_curr_map), + .maxi = &ab8500_maxi_params, .chg_params = &chg, .fg_params = &fg, + .chg_input_curr = ab8500_charge_input_curr_map, + .n_chg_in_curr = ARRAY_SIZE(ab8500_charge_input_curr_map), +}; + +struct abx500_bm_data ab8540_bm_data = { + .temp_under = 3, + .temp_low = 8, + .temp_high = 43, + .temp_over = 48, + .main_safety_tmr_h = 4, + .temp_interval_chg = 20, + .temp_interval_nochg = 120, + .usb_safety_tmr_h = 4, + .bkup_bat_v = BUP_VCH_SEL_2P6V, + .bkup_bat_i = BUP_ICH_SEL_150UA, + .no_maintenance = false, + .capacity_scaling = false, + .adc_therm = ABx500_ADC_THERM_BATCTRL, + .chg_unknown_bat = false, + .enable_overshoot = false, + .fg_res = 100, + .cap_levels = &cap_levels, + .bat_type = bat_type_thermistor, + .n_btypes = ARRAY_SIZE(bat_type_thermistor), + .batt_id = 0, + .interval_charging = 5, + .interval_not_charging = 120, + .temp_hysteresis = 3, + .gnd_lift_resistance = 0, + .maxi = &abx540_maxi_params, + .chg_params = &chg, + .fg_params = &fg, + .chg_output_curr = ab8540_charge_output_curr_map, + .n_chg_out_curr = ARRAY_SIZE(ab8540_charge_output_curr_map), + .chg_input_curr = ab8540_charge_input_curr_map, + .n_chg_in_curr = ARRAY_SIZE(ab8540_charge_input_curr_map), }; int ab8500_bm_of_probe(struct device *dev, diff --git a/drivers/power/ab8500_btemp.c b/drivers/power/ab8500_btemp.c index 91ad3edf6197..7336dcf45f7e 100644 --- a/drivers/power/ab8500_btemp.c +++ b/drivers/power/ab8500_btemp.c @@ -42,6 +42,9 @@ #define BTEMP_BATCTRL_CURR_SRC_16UA 16 #define BTEMP_BATCTRL_CURR_SRC_18UA 18 +#define BTEMP_BATCTRL_CURR_SRC_60UA 60 +#define BTEMP_BATCTRL_CURR_SRC_120UA 120 + #define to_ab8500_btemp_device_info(x) container_of((x), \ struct ab8500_btemp, btemp_psy); @@ -216,7 +219,12 @@ static int ab8500_btemp_curr_source_enable(struct ab8500_btemp *di, /* Only do this for batteries with internal NTC */ if (di->bm->adc_therm == ABx500_ADC_THERM_BATCTRL && enable) { - if (is_ab9540(di->parent) || is_ab8505(di->parent)) { + if (is_ab8540(di->parent)) { + if (di->curr_source == BTEMP_BATCTRL_CURR_SRC_60UA) + curr = BAT_CTRL_60U_ENA; + else + curr = BAT_CTRL_120U_ENA; + } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { if (di->curr_source == BTEMP_BATCTRL_CURR_SRC_16UA) curr = BAT_CTRL_16U_ENA; else @@ -257,7 +265,14 @@ static int ab8500_btemp_curr_source_enable(struct ab8500_btemp *di, } else if (di->bm->adc_therm == ABx500_ADC_THERM_BATCTRL && !enable) { dev_dbg(di->dev, "Disable BATCTRL curr source\n"); - if (is_ab9540(di->parent) || is_ab8505(di->parent)) { + if (is_ab8540(di->parent)) { + /* 
Write 0 to the curr bits */ + ret = abx500_mask_and_set_register_interruptible( + di->dev, + AB8500_CHARGER, AB8500_BAT_CTRL_CURRENT_SOURCE, + BAT_CTRL_60U_ENA | BAT_CTRL_120U_ENA, + ~(BAT_CTRL_60U_ENA | BAT_CTRL_120U_ENA)); + } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { /* Write 0 to the curr bits */ ret = abx500_mask_and_set_register_interruptible( di->dev, @@ -314,7 +329,13 @@ static int ab8500_btemp_curr_source_enable(struct ab8500_btemp *di, * if we got an error above */ disable_curr_source: - if (is_ab9540(di->parent) || is_ab8505(di->parent)) { + if (is_ab8540(di->parent)) { + /* Write 0 to the curr bits */ + ret = abx500_mask_and_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8500_BAT_CTRL_CURRENT_SOURCE, + BAT_CTRL_60U_ENA | BAT_CTRL_120U_ENA, + ~(BAT_CTRL_60U_ENA | BAT_CTRL_120U_ENA)); + } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { /* Write 0 to the curr bits */ ret = abx500_mask_and_set_register_interruptible(di->dev, AB8500_CHARGER, AB8500_BAT_CTRL_CURRENT_SOURCE, @@ -541,7 +562,9 @@ static int ab8500_btemp_id(struct ab8500_btemp *di) { int res; u8 i; - if (is_ab9540(di->parent) || is_ab8505(di->parent)) + if (is_ab8540(di->parent)) + di->curr_source = BTEMP_BATCTRL_CURR_SRC_60UA; + else if (is_ab9540(di->parent) || is_ab8505(di->parent)) di->curr_source = BTEMP_BATCTRL_CURR_SRC_16UA; else di->curr_source = BTEMP_BATCTRL_CURR_SRC_7UA; @@ -582,9 +605,14 @@ static int ab8500_btemp_id(struct ab8500_btemp *di) * detected type is Type 1, else we use the 7uA source */ if (di->bm->adc_therm == ABx500_ADC_THERM_BATCTRL && - di->bm->batt_id == 1) { - if (is_ab9540(di->parent) || is_ab8505(di->parent)) { - dev_dbg(di->dev, "Set BATCTRL current source to 16uA\n"); + di->bm->batt_id == 1) { + if (is_ab8540(di->parent)) { + dev_dbg(di->dev, + "Set BATCTRL current source to 60uA\n"); + di->curr_source = BTEMP_BATCTRL_CURR_SRC_60UA; + } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { + dev_dbg(di->dev, + "Set BATCTRL current source to 16uA\n"); di->curr_source = BTEMP_BATCTRL_CURR_SRC_16UA; } else { dev_dbg(di->dev, "Set BATCTRL current source to 20uA\n"); diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c index f249a65b02e1..6089ee7bc609 100644 --- a/drivers/power/ab8500_charger.c +++ b/drivers/power/ab8500_charger.c @@ -57,7 +57,9 @@ #define MAIN_CH_INPUT_CURR_SHIFT 4 #define VBUS_IN_CURR_LIM_SHIFT 4 +#define AB8540_VBUS_IN_CURR_LIM_SHIFT 2 #define AUTO_VBUS_IN_CURR_LIM_SHIFT 4 +#define AB8540_AUTO_VBUS_IN_CURR_MASK 0x3F #define VBUS_IN_CURR_LIM_RETRY_SET_TIME 30 /* seconds */ #define LED_INDICATOR_PWM_ENA 0x01 @@ -82,6 +84,7 @@ #define AB8500_USB_LINK_STATUS 0x78 #define AB8505_USB_LINK_STATUS 0xF8 #define AB8500_STD_HOST_SUSP 0x18 +#define USB_LINK_STATUS_SHIFT 3 /* Watchdog timeout constant */ #define WD_TIMER 0x30 /* 4min */ @@ -751,8 +754,7 @@ static int ab8500_charger_max_usb_curr(struct ab8500_charger *di, "VBUS has collapsed\n"); ret = -ENXIO; break; - } - if (is_ab9540(di->parent) || is_ab8505(di->parent)) { + } else { dev_dbg(di->dev, "USB Type - Charging not allowed\n"); di->max_usb_in_curr.usb_type_max = USB_CH_IP_CUR_LVL_0P05; @@ -807,30 +809,22 @@ static int ab8500_charger_read_usb_type(struct ab8500_charger *di) dev_err(di->dev, "%s ab8500 read failed\n", __func__); return ret; } - if (is_ab8500(di->parent)) { + if (is_ab8500(di->parent)) ret = abx500_get_register_interruptible(di->dev, AB8500_USB, - AB8500_USB_LINE_STAT_REG, &val); - } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { - 
ret = abx500_get_register_interruptible(di->dev, - AB8500_USB, AB8500_USB_LINK1_STAT_REG, &val); - } else { - dev_err(di->dev, "%s unsupported analog baseband\n", __func__); - return -ENXIO; - } + AB8500_USB_LINE_STAT_REG, &val); + else + ret = abx500_get_register_interruptible(di->dev, + AB8500_USB, AB8500_USB_LINK1_STAT_REG, &val); if (ret < 0) { dev_err(di->dev, "%s ab8500 read failed\n", __func__); return ret; } /* get the USB type */ - if (is_ab8500(di->parent)) { - val = (val & AB8500_USB_LINK_STATUS) >> 3; - } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { - val = (val & AB8505_USB_LINK_STATUS) >> 3; - } else { - dev_err(di->dev, "%s unsupported analog baseband\n", __func__); - return -ENXIO; - } + if (is_ab8500(di->parent)) + val = (val & AB8500_USB_LINK_STATUS) >> USB_LINK_STATUS_SHIFT; + else + val = (val & AB8505_USB_LINK_STATUS) >> USB_LINK_STATUS_SHIFT; ret = ab8500_charger_max_usb_curr(di, (enum ab8500_charger_link_status) val); @@ -866,16 +860,12 @@ static int ab8500_charger_detect_usb_type(struct ab8500_charger *di) return ret; } - if (is_ab8500(di->parent)) { + if (is_ab8500(di->parent)) ret = abx500_get_register_interruptible(di->dev, AB8500_USB, AB8500_USB_LINE_STAT_REG, &val); - } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { + else ret = abx500_get_register_interruptible(di->dev, AB8500_USB, AB8500_USB_LINK1_STAT_REG, &val); - } else { - dev_err(di->dev, "%s unsupported analog baseband\n", __func__); - return -ENXIO; - } if (ret < 0) { dev_err(di->dev, "%s ab8500 read failed\n", __func__); return ret; @@ -889,14 +879,12 @@ static int ab8500_charger_detect_usb_type(struct ab8500_charger *di) */ /* get the USB type */ - if (is_ab8500(di->parent)) { - val = (val & AB8500_USB_LINK_STATUS) >> 3; - } else if (is_ab9540(di->parent) || is_ab8505(di->parent)) { - val = (val & AB8505_USB_LINK_STATUS) >> 3; - } else { - dev_err(di->dev, "%s unsupported analog baseband\n", __func__); - return -ENXIO; - } + if (is_ab8500(di->parent)) + val = (val & AB8500_USB_LINK_STATUS) >> + USB_LINK_STATUS_SHIFT; + else + val = (val & AB8505_USB_LINK_STATUS) >> + USB_LINK_STATUS_SHIFT; if (val) break; } @@ -991,51 +979,6 @@ static int ab8500_charger_voltage_map[] = { 4600 , }; -/* - * This array maps the raw hex value to charger current used by the AB8500 - * Values taken from the UM0836 - */ -static int ab8500_charger_current_map[] = { - 100 , - 200 , - 300 , - 400 , - 500 , - 600 , - 700 , - 800 , - 900 , - 1000 , - 1100 , - 1200 , - 1300 , - 1400 , - 1500 , -}; - -/* - * This array maps the raw hex value to VBUS input current used by the AB8500 - * Values taken from the UM0836 - */ -static int ab8500_charger_vbus_in_curr_map[] = { - USB_CH_IP_CUR_LVL_0P05, - USB_CH_IP_CUR_LVL_0P09, - USB_CH_IP_CUR_LVL_0P19, - USB_CH_IP_CUR_LVL_0P29, - USB_CH_IP_CUR_LVL_0P38, - USB_CH_IP_CUR_LVL_0P45, - USB_CH_IP_CUR_LVL_0P5, - USB_CH_IP_CUR_LVL_0P6, - USB_CH_IP_CUR_LVL_0P7, - USB_CH_IP_CUR_LVL_0P8, - USB_CH_IP_CUR_LVL_0P9, - USB_CH_IP_CUR_LVL_1P0, - USB_CH_IP_CUR_LVL_1P1, - USB_CH_IP_CUR_LVL_1P3, - USB_CH_IP_CUR_LVL_1P4, - USB_CH_IP_CUR_LVL_1P5, -}; - static int ab8500_voltage_to_regval(int voltage) { int i; @@ -1057,41 +1000,41 @@ static int ab8500_voltage_to_regval(int voltage) return -1; } -static int ab8500_current_to_regval(int curr) +static int ab8500_current_to_regval(struct ab8500_charger *di, int curr) { int i; - if (curr < ab8500_charger_current_map[0]) + if (curr < di->bm->chg_output_curr[0]) return 0; - for (i = 0; i < ARRAY_SIZE(ab8500_charger_current_map); i++) { - if 
(curr < ab8500_charger_current_map[i]) + for (i = 0; i < di->bm->n_chg_out_curr; i++) { + if (curr < di->bm->chg_output_curr[i]) return i - 1; } /* If not last element, return error */ - i = ARRAY_SIZE(ab8500_charger_current_map) - 1; - if (curr == ab8500_charger_current_map[i]) + i = di->bm->n_chg_out_curr - 1; + if (curr == di->bm->chg_output_curr[i]) return i; else return -1; } -static int ab8500_vbus_in_curr_to_regval(int curr) +static int ab8500_vbus_in_curr_to_regval(struct ab8500_charger *di, int curr) { int i; - if (curr < ab8500_charger_vbus_in_curr_map[0]) + if (curr < di->bm->chg_input_curr[0]) return 0; - for (i = 0; i < ARRAY_SIZE(ab8500_charger_vbus_in_curr_map); i++) { - if (curr < ab8500_charger_vbus_in_curr_map[i]) + for (i = 0; i < di->bm->n_chg_in_curr; i++) { + if (curr < di->bm->chg_input_curr[i]) return i - 1; } /* If not last element, return error */ - i = ARRAY_SIZE(ab8500_charger_vbus_in_curr_map) - 1; - if (curr == ab8500_charger_vbus_in_curr_map[i]) + i = di->bm->n_chg_in_curr - 1; + if (curr == di->bm->chg_input_curr[i]) return i; else return -1; @@ -1169,7 +1112,7 @@ static int ab8500_charger_set_current(struct ab8500_charger *di, int ich, int reg) { int ret = 0; - int auto_curr_index, curr_index, prev_curr_index, shift_value, i; + int curr_index, prev_curr_index, shift_value, i; u8 reg_value; u32 step_udelay; bool no_stepping = false; @@ -1187,39 +1130,27 @@ static int ab8500_charger_set_current(struct ab8500_charger *di, case AB8500_MCH_IPT_CURLVL_REG: shift_value = MAIN_CH_INPUT_CURR_SHIFT; prev_curr_index = (reg_value >> shift_value); - curr_index = ab8500_current_to_regval(ich); + curr_index = ab8500_current_to_regval(di, ich); step_udelay = STEP_UDELAY; if (!di->ac.charger_connected) no_stepping = true; break; case AB8500_USBCH_IPT_CRNTLVL_REG: - shift_value = VBUS_IN_CURR_LIM_SHIFT; + if (is_ab8540(di->parent)) + shift_value = AB8540_VBUS_IN_CURR_LIM_SHIFT; + else + shift_value = VBUS_IN_CURR_LIM_SHIFT; prev_curr_index = (reg_value >> shift_value); - curr_index = ab8500_vbus_in_curr_to_regval(ich); + curr_index = ab8500_vbus_in_curr_to_regval(di, ich); step_udelay = STEP_UDELAY * 100; - ret = abx500_get_register_interruptible(di->dev, AB8500_CHARGER, - AB8500_CH_USBCH_STAT2_REG, ®_value); - if (ret < 0) { - dev_err(di->dev, "%s read failed\n", __func__); - goto exit_set_current; - } - auto_curr_index = - reg_value >> AUTO_VBUS_IN_CURR_LIM_SHIFT; - - dev_dbg(di->dev, "%s Auto VBUS curr is %d mA\n", - __func__, - ab8500_charger_vbus_in_curr_map[auto_curr_index]); - - prev_curr_index = min(prev_curr_index, auto_curr_index); - if (!di->usb.charger_connected) no_stepping = true; break; case AB8500_CH_OPT_CRNTLVL_REG: shift_value = 0; prev_curr_index = (reg_value >> shift_value); - curr_index = ab8500_current_to_regval(ich); + curr_index = ab8500_current_to_regval(di, ich); step_udelay = STEP_UDELAY; if (curr_index && (curr_index - prev_curr_index) > 1) step_udelay *= 100; @@ -1459,8 +1390,8 @@ static int ab8500_charger_ac_en(struct ux500_charger *charger, /* Check if the requested voltage or current is valid */ volt_index = ab8500_voltage_to_regval(vset); - curr_index = ab8500_current_to_regval(iset); - input_curr_index = ab8500_current_to_regval( + curr_index = ab8500_current_to_regval(di, iset); + input_curr_index = ab8500_current_to_regval(di, di->bm->chg_params->ac_curr_max); if (volt_index < 0 || curr_index < 0 || input_curr_index < 0) { dev_err(di->dev, @@ -1631,7 +1562,7 @@ static int ab8500_charger_usb_en(struct ux500_charger *charger, /* Check if the 
requested voltage or current is valid */ volt_index = ab8500_voltage_to_regval(vset); - curr_index = ab8500_current_to_regval(ich_out); + curr_index = ab8500_current_to_regval(di, ich_out); if (volt_index < 0 || curr_index < 0) { dev_err(di->dev, "Charger voltage or current too high, " @@ -2396,18 +2327,21 @@ static void ab8500_charger_usb_link_status_work(struct work_struct *work) else dev_dbg(di->dev, "Error reading USB link status\n"); - if (is_ab9540(di->parent) || is_ab8505(di->parent)) - link_status = AB8505_USB_LINK_STATUS; - else + if (is_ab8500(di->parent)) link_status = AB8500_USB_LINK_STATUS; + else + link_status = AB8505_USB_LINK_STATUS; if (detected_chargers & USB_PW_CONN) { - if (((val & link_status) >> 3) == USB_STAT_NOT_VALID_LINK && + if (((val & link_status) >> USB_LINK_STATUS_SHIFT) == + USB_STAT_NOT_VALID_LINK && di->invalid_charger_detect_state == 0) { - dev_dbg(di->dev, "Invalid charger detected, state= 0\n"); + dev_dbg(di->dev, + "Invalid charger detected, state= 0\n"); /*Enable charger*/ abx500_mask_and_set_register_interruptible(di->dev, - AB8500_CHARGER, AB8500_USBCH_CTRL1_REG, 0x01, 0x01); + AB8500_CHARGER, AB8500_USBCH_CTRL1_REG, + USB_CH_ENA, USB_CH_ENA); /*Enable charger detection*/ abx500_mask_and_set_register_interruptible(di->dev, AB8500_USB, AB8500_MCH_IPT_CURLVL_REG, 0x01, 0x01); @@ -2417,15 +2351,17 @@ static void ab8500_charger_usb_link_status_work(struct work_struct *work) } if (di->invalid_charger_detect_state == 1) { - dev_dbg(di->dev, "Invalid charger detected, state= 1\n"); + dev_dbg(di->dev, + "Invalid charger detected, state= 1\n"); /*Stop charger detection*/ abx500_mask_and_set_register_interruptible(di->dev, AB8500_USB, AB8500_MCH_IPT_CURLVL_REG, 0x01, 0x00); /*Check link status*/ - ret = abx500_get_register_interruptible(di->dev, AB8500_USB, + ret = abx500_get_register_interruptible(di->dev, + AB8500_USB, AB8500_USB_LINE_STAT_REG, &val); dev_dbg(di->dev, "USB link status= 0x%02x\n", - (val & link_status) >> 3); + (val & link_status) >> USB_LINK_STATUS_SHIFT); di->invalid_charger_detect_state = 2; } } else { @@ -2741,7 +2677,7 @@ static void ab8500_charger_vbus_drop_end_work(struct work_struct *work) { struct ab8500_charger *di = container_of(work, struct ab8500_charger, vbus_drop_end_work.work); - int ret; + int ret, curr; u8 reg_value; di->flags.vbus_drop_end = false; @@ -2749,32 +2685,41 @@ static void ab8500_charger_vbus_drop_end_work(struct work_struct *work) /* Reset the drop counter */ abx500_set_register_interruptible(di->dev, AB8500_CHARGER, AB8500_CHARGER_CTRL, 0x01); - ret = abx500_get_register_interruptible(di->dev, AB8500_CHARGER, - AB8500_CH_USBCH_STAT2_REG, - ®_value); + + if (is_ab8540(di->parent)) + ret = abx500_get_register_interruptible(di->dev, AB8500_CHARGER, + AB8540_CH_USBCH_STAT3_REG, ®_value); + else + ret = abx500_get_register_interruptible(di->dev, AB8500_CHARGER, + AB8500_CH_USBCH_STAT2_REG, ®_value); if (ret < 0) { - dev_err(di->dev, "%s ab8500 read failed\n", __func__); - } else { - int curr = ab8500_charger_vbus_in_curr_map[ + dev_err(di->dev, "%s read failed\n", __func__); + return; + } + + if (is_ab8540(di->parent)) + curr = di->bm->chg_input_curr[ + reg_value & AB8540_AUTO_VBUS_IN_CURR_MASK]; + else + curr = di->bm->chg_input_curr[ reg_value >> AUTO_VBUS_IN_CURR_LIM_SHIFT]; - if (di->max_usb_in_curr.calculated_max != curr) { - /* USB source is collapsing */ - di->max_usb_in_curr.calculated_max = curr; - dev_dbg(di->dev, - "VBUS input current limiting to %d mA\n", - di->max_usb_in_curr.calculated_max); - } else 
{ - /* - * USB source can not give more than this amount. - * Taking more will collapse the source. - */ - di->max_usb_in_curr.set_max = - di->max_usb_in_curr.calculated_max; - dev_dbg(di->dev, - "VBUS input current limited to %d mA\n", - di->max_usb_in_curr.set_max); - return; - } + + if (di->max_usb_in_curr.calculated_max != curr) { + /* USB source is collapsing */ + di->max_usb_in_curr.calculated_max = curr; + dev_dbg(di->dev, + "VBUS input current limiting to %d mA\n", + di->max_usb_in_curr.calculated_max); + } else { + /* + * USB source can not give more than this amount. + * Taking more will collapse the source. + */ + di->max_usb_in_curr.set_max = + di->max_usb_in_curr.calculated_max; + dev_dbg(di->dev, + "VBUS input current limited to %d mA\n", + di->max_usb_in_curr.set_max); } if (di->usb.charger_connected) @@ -3134,9 +3079,14 @@ static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) goto out; } - ret = abx500_set_register_interruptible(di->dev, - AB8500_CHARGER, - AB8500_CH_OPT_CRNTLVL_MAX_REG, CH_OP_CUR_LVL_1P6); + if (is_ab8540(di->parent)) + ret = abx500_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8500_CH_OPT_CRNTLVL_MAX_REG, + CH_OP_CUR_LVL_2P); + else + ret = abx500_set_register_interruptible(di->dev, + AB8500_CHARGER, AB8500_CH_OPT_CRNTLVL_MAX_REG, + CH_OP_CUR_LVL_1P6); if (ret) { dev_err(di->dev, "failed to set CH_OPT_CRNTLVL_MAX_REG\n"); @@ -3144,7 +3094,8 @@ static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) } } - if (is_ab9540_2p0(di->parent) || is_ab8505_2p0(di->parent)) + if (is_ab9540_2p0(di->parent) || is_ab9540_3p0(di->parent) + || is_ab8505_2p0(di->parent) || is_ab8540(di->parent)) ret = abx500_mask_and_set_register_interruptible(di->dev, AB8500_CHARGER, AB8500_USBCH_CTRL2_REG, @@ -3250,7 +3201,8 @@ static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) AB8500_RTC_CTRL1_REG, bup_vch_range | vbup33_vrtcn); if (ret) { - dev_err(di->dev, "failed to setup backup battery charging\n"); + dev_err(di->dev, + "failed to setup backup battery charging\n"); goto out; } } @@ -3267,14 +3219,16 @@ static int ab8500_charger_init_hw_registers(struct ab8500_charger *di) AB8500_CHARGER, AB8540_USB_PP_MODE_REG, BUS_VSYS_VOL_SELECT_MASK, BUS_VSYS_VOL_SELECT_3P6V); if (ret) { - dev_err(di->dev, "failed to setup usb power path vsys voltage\n"); + dev_err(di->dev, + "failed to setup usb power path vsys voltage\n"); goto out; } ret = abx500_mask_and_set_register_interruptible(di->dev, AB8500_CHARGER, AB8540_USB_PP_CHR_REG, BUS_PP_PRECHG_CURRENT_MASK, 0); if (ret) { - dev_err(di->dev, "failed to setup usb power path prechage current\n"); + dev_err(di->dev, + "failed to setup usb power path prechage current\n"); goto out; } } @@ -3537,8 +3491,8 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->ac_chg.ops.update_curr = &ab8500_charger_update_charger_current; di->ac_chg.max_out_volt = ab8500_charger_voltage_map[ ARRAY_SIZE(ab8500_charger_voltage_map) - 1]; - di->ac_chg.max_out_curr = ab8500_charger_current_map[ - ARRAY_SIZE(ab8500_charger_current_map) - 1]; + di->ac_chg.max_out_curr = + di->bm->chg_output_curr[di->bm->n_chg_out_curr - 1]; di->ac_chg.wdt_refresh = CHG_WD_INTERVAL; di->ac_chg.enabled = di->bm->ac_enabled; di->ac_chg.external = false; @@ -3566,8 +3520,8 @@ static int ab8500_charger_probe(struct platform_device *pdev) di->usb_chg.ops.pre_chg_enable = &ab8540_charger_usb_pre_chg_enable; di->usb_chg.max_out_volt = ab8500_charger_voltage_map[ ARRAY_SIZE(ab8500_charger_voltage_map) - 1]; - 
di->usb_chg.max_out_curr = ab8500_charger_current_map[ - ARRAY_SIZE(ab8500_charger_current_map) - 1]; + di->usb_chg.max_out_curr = + di->bm->chg_output_curr[di->bm->n_chg_out_curr - 1]; di->usb_chg.wdt_refresh = CHG_WD_INTERVAL; di->usb_chg.enabled = di->bm->usb_enabled; di->usb_chg.external = false; diff --git a/include/linux/mfd/abx500.h b/include/linux/mfd/abx500.h index cd71d8eadf50..33b0253569a3 100644 --- a/include/linux/mfd/abx500.h +++ b/include/linux/mfd/abx500.h @@ -246,7 +246,11 @@ struct abx500_bm_charger_parameters { * @interval_not_charging charge alg cycle period time when not charging (sec) * @temp_hysteresis temperature hysteresis * @gnd_lift_resistance Battery ground to phone ground resistance (mOhm) - * @maxi: maximization parameters + * @n_chg_out_curr number of elements in array chg_output_curr + * @n_chg_in_curr number of elements in array chg_input_curr + * @chg_output_curr charger output current level map + * @chg_input_curr charger input current level map + * @maxi maximization parameters * @cap_levels capacity in percent for the different capacity levels * @bat_type table of supported battery types * @chg_params charger parameters @@ -281,6 +285,10 @@ struct abx500_bm_data { int interval_not_charging; int temp_hysteresis; int gnd_lift_resistance; + int n_chg_out_curr; + int n_chg_in_curr; + int *chg_output_curr; + int *chg_input_curr; const struct abx500_maxim_parameters *maxi; const struct abx500_bm_capacity_levels *cap_levels; struct abx500_battery_type *bat_type; diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index 0ebf0c5d1f88..ee1c1626c886 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -33,7 +33,7 @@ #define AB8500_CH_STATUS2_REG 0x01 #define AB8500_CH_USBCH_STAT1_REG 0x02 #define AB8500_CH_USBCH_STAT2_REG 0x03 -#define AB8500_CH_FSM_STAT_REG 0x04 +#define AB8540_CH_USBCH_STAT3_REG 0x04 #define AB8500_CH_STAT_REG 0x05 /* @@ -157,6 +157,7 @@ #define CH_OP_CUR_LVL_1P4 0x0D #define CH_OP_CUR_LVL_1P5 0x0E #define CH_OP_CUR_LVL_1P6 0x0F +#define CH_OP_CUR_LVL_2P 0x3F /* BTEMP High thermal limits */ #define BTEMP_HIGH_TH_57_0 0x00 @@ -246,6 +247,8 @@ enum bup_vch_sel { #define BAT_CTRL_20U_ENA 0x02 #define BAT_CTRL_18U_ENA 0x01 #define BAT_CTRL_16U_ENA 0x02 +#define BAT_CTRL_60U_ENA 0x01 +#define BAT_CTRL_120U_ENA 0x02 #define BAT_CTRL_CMP_ENA 0x04 #define FORCE_BAT_CTRL_CMP_HIGH 0x08 #define BAT_CTRL_PULL_UP_ENA 0x10 -- cgit From b3ea5f451e4e435b650e34142f8552002dc21297 Mon Sep 17 00:00:00 2001 From: Marcus Cooper Date: Wed, 29 Aug 2012 17:56:19 +0200 Subject: ab8500-charger: Add UsbLineCtrl2 reference When the state of USB charge detection is changed, the calls use a define that belongs to a different register in another bank. This change creates a new define for the correct register and removes the magic numbers that are present. 
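As an aside for readers of the driver, the named bit turns the register update into self-documenting code. A minimal sketch, assuming the helper clears the bits selected by the mask and then sets them to the supplied value (the wrapper function is hypothetical; the register and bit names are the ones introduced below):

    /* Hypothetical helper showing both call sites of this change,
     * inside the ab8500_charger driver where *di is in scope. */
    static void example_toggle_usb_ch_det(struct ab8500_charger *di, bool on)
    {
            /* Read-modify-write: only the USB_CH_DET bit is touched. */
            abx500_mask_and_set_register_interruptible(di->dev,
                            AB8500_USB, AB8500_USB_LINE_CTRL2_REG,
                            USB_CH_DET, on ? USB_CH_DET : 0x00);
    }

Compared with the bare 0x01 used before, both the mask and the value now read as what they actually control.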
Signed-off-by: Marcus Cooper Signed-off-by: Lee Jones Reviewed-by: Hakan BERG Reviewed-by: Jonas ABERG --- drivers/power/ab8500_charger.c | 11 +++++++---- include/linux/mfd/abx500/ab8500-bm.h | 1 + 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c index bf8b479914cd..64accb235d2c 100644 --- a/drivers/power/ab8500_charger.c +++ b/drivers/power/ab8500_charger.c @@ -54,6 +54,7 @@ #define VBUS_DET_DBNC1 0x01 #define OTP_ENABLE_WD 0x01 #define DROP_COUNT_RESET 0x01 +#define USB_CH_DET 0x01 #define MAIN_CH_INPUT_CURR_SHIFT 4 #define VBUS_IN_CURR_LIM_SHIFT 4 @@ -2348,8 +2349,9 @@ static void ab8500_charger_usb_link_status_work(struct work_struct *work) AB8500_CHARGER, AB8500_USBCH_CTRL1_REG, USB_CH_ENA, USB_CH_ENA); /*Enable charger detection*/ - abx500_mask_and_set_register_interruptible(di->dev, AB8500_USB, - AB8500_MCH_IPT_CURLVL_REG, 0x01, 0x01); + abx500_mask_and_set_register_interruptible(di->dev, + AB8500_USB, AB8500_USB_LINE_CTRL2_REG, + USB_CH_DET, USB_CH_DET); di->invalid_charger_detect_state = 1; /*exit and wait for new link status interrupt.*/ return; @@ -2359,8 +2361,9 @@ static void ab8500_charger_usb_link_status_work(struct work_struct *work) dev_dbg(di->dev, "Invalid charger detected, state= 1\n"); /*Stop charger detection*/ - abx500_mask_and_set_register_interruptible(di->dev, AB8500_USB, - AB8500_MCH_IPT_CURLVL_REG, 0x01, 0x00); + abx500_mask_and_set_register_interruptible(di->dev, + AB8500_USB, AB8500_USB_LINE_CTRL2_REG, + USB_CH_DET, 0x00); /*Check link status*/ if (is_ab8500(di->parent)) ret = abx500_get_register_interruptible(di->dev, diff --git a/include/linux/mfd/abx500/ab8500-bm.h b/include/linux/mfd/abx500/ab8500-bm.h index ee1c1626c886..f5214dc651f9 100644 --- a/include/linux/mfd/abx500/ab8500-bm.h +++ b/include/linux/mfd/abx500/ab8500-bm.h @@ -23,6 +23,7 @@ * Bank : 0x5 */ #define AB8500_USB_LINE_STAT_REG 0x80 +#define AB8500_USB_LINE_CTRL2_REG 0x82 #define AB8500_USB_LINK1_STAT_REG 0x94 /* -- cgit From f4095a0f06476e5914f2c58b4e96258b2e2ba6b7 Mon Sep 17 00:00:00 2001 From: M BenZoubeir Date: Thu, 13 Sep 2012 10:34:18 +0200 Subject: pm2301-charger: Adjust interrupt handler behavior Signed-off-by: M BenZoubeir Signed-off-by: Lee Jones Reviewed-by: Philippe LANGLAIS --- drivers/power/pm2301_charger.c | 45 ++++++++++++++++++++++-------------------- include/linux/pm2301_charger.h | 2 +- 2 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/power/pm2301_charger.c b/drivers/power/pm2301_charger.c index b8eafc3850d0..eed8a89ba4f0 100644 --- a/drivers/power/pm2301_charger.c +++ b/drivers/power/pm2301_charger.c @@ -493,14 +493,16 @@ static irqreturn_t pm2xxx_irq_int(int irq, void *data) struct pm2xxx_interrupts *interrupt = pm2->pm2_int; int i; - for (i = 0; i < PM2XXX_NUM_INT_REG; i++) { - pm2xxx_reg_read(pm2, + do { + for (i = 0; i < PM2XXX_NUM_INT_REG; i++) { + pm2xxx_reg_read(pm2, pm2xxx_interrupt_registers[i], &(interrupt->reg[i])); - if (interrupt->reg[i] > 0) - interrupt->handler[i](pm2, interrupt->reg[i]); - } + if (interrupt->reg[i] > 0) + interrupt->handler[i](pm2, interrupt->reg[i]); + } + } while (gpio_get_value(pm2->pdata->gpio_irq_number) == 0); return IRQ_HANDLED; } @@ -951,6 +953,7 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client, struct pm2xxx_charger *pm2; int ret = 0; u8 val; + int i; pm2 = kzalloc(sizeof(struct pm2xxx_charger), GFP_KERNEL); if (!pm2) { @@ -1062,24 +1065,25 @@ static int 
pm2xxx_wall_charger_probe(struct i2c_client *i2c_client, } /* Register interrupts */ - ret = request_threaded_irq(pm2->pdata->irq_number, NULL, + ret = request_threaded_irq(gpio_to_irq(pm2->pdata->gpio_irq_number), + NULL, pm2xxx_charger_irq[0].isr, pm2->pdata->irq_type, pm2xxx_charger_irq[0].name, pm2); if (ret != 0) { dev_err(pm2->dev, "failed to request %s IRQ %d: %d\n", - pm2xxx_charger_irq[0].name, pm2->pdata->irq_number, ret); + pm2xxx_charger_irq[0].name, + gpio_to_irq(pm2->pdata->gpio_irq_number), ret); goto unregister_pm2xxx_charger; } /* pm interrupt can wake up system */ - ret = enable_irq_wake(pm2->pdata->irq_number); + ret = enable_irq_wake(gpio_to_irq(pm2->pdata->gpio_irq_number)); if (ret) { dev_err(pm2->dev, "failed to set irq wake\n"); goto unregister_pm2xxx_interrupt; } - /*Initialize lock*/ mutex_init(&pm2->lock); /* @@ -1099,16 +1103,16 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client, } set_lpn_pin(pm2); + + /* read interrupt registers */ + for (i = 0; i < PM2XXX_NUM_INT_REG; i++) + pm2xxx_reg_read(pm2, + pm2xxx_interrupt_registers[i], + &val); + ret = pm2xxx_charger_detection(pm2, &val); if ((ret == 0) && val) { - /* - * When boot is due to AC charger plug-in, - * read interrupt registers - */ - pm2xxx_reg_read(pm2, PM2XXX_REG_INT1, &val); - pm2xxx_reg_read(pm2, PM2XXX_REG_INT2, &val); - pm2xxx_reg_read(pm2, PM2XXX_REG_INT4, &val); pm2->ac.charger_connected = 1; ab8500_override_turn_on_stat(~AB8500_POW_KEY_1_ON, AB8500_MAIN_CH_DET); @@ -1122,10 +1126,10 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client, free_gpio: gpio_free(pm2->lpn_pin); disable_pm2_irq_wake: - disable_irq_wake(pm2->pdata->irq_number); + disable_irq_wake(gpio_to_irq(pm2->pdata->gpio_irq_number)); unregister_pm2xxx_interrupt: /* disable interrupt */ - free_irq(pm2->pdata->irq_number, pm2); + free_irq(gpio_to_irq(pm2->pdata->gpio_irq_number), pm2); unregister_pm2xxx_charger: /* unregister power supply */ power_supply_unregister(&pm2->ac_chg.psy); @@ -1148,10 +1152,10 @@ static int pm2xxx_wall_charger_remove(struct i2c_client *i2c_client) pm2xxx_charger_ac_en(&pm2->ac_chg, false, 0, 0); /* Disable wake by pm interrupt */ - disable_irq_wake(pm2->pdata->irq_number); + disable_irq_wake(gpio_to_irq(pm2->pdata->gpio_irq_number)); /* Disable interrupts */ - free_irq(pm2->pdata->irq_number, pm2); + free_irq(gpio_to_irq(pm2->pdata->gpio_irq_number), pm2); /* Delete the work queue */ destroy_workqueue(pm2->charger_wq); @@ -1163,7 +1167,6 @@ static int pm2xxx_wall_charger_remove(struct i2c_client *i2c_client) power_supply_unregister(&pm2->ac_chg.psy); - /*Free GPIO60*/ gpio_free(pm2->lpn_pin); kfree(pm2); diff --git a/include/linux/pm2301_charger.h b/include/linux/pm2301_charger.h index fc3f026922ae..85c16defe11a 100644 --- a/include/linux/pm2301_charger.h +++ b/include/linux/pm2301_charger.h @@ -48,7 +48,7 @@ struct pm2xxx_charger_platform_data { size_t num_supplicants; int i2c_bus; const char *label; - int irq_number; + int gpio_irq_number; unsigned int lpn_gpio; int irq_type; }; -- cgit From 9684819b5a29e62acd8265a92d8f3454de9bb71e Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Wed, 27 Feb 2013 16:38:17 +0100 Subject: HID: ll_driver: Extend the interface with idle requests Some drivers send the idle command directly to the underlying device, creating an unwanted dependency on the underlying transport layer. This patch adds hid_hw_idle() to the interface, thereby removing usbhid from the lion's share of the drivers. 
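As a usage sketch only (the driver and its probe function here are hypothetical, not part of this patch), a transport-agnostic driver can now issue the request through the new hook; per the interface below, transports that do not implement ->idle simply return 0:

    static int example_hid_probe(struct hid_device *hdev,
                                 const struct hid_device_id *id)
    {
            int ret;

            /* Ask the transport to disable idle reports for all report
             * IDs, without knowing whether the device sits on USB. */
            ret = hid_hw_idle(hdev, 0, 0, HID_REQ_SET_IDLE);
            if (ret < 0)
                    return ret;

            return 0;
    }

The caller no longer needs to include usbhid headers or check the transport type; the decision is delegated to the ll_driver.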
Signed-off-by: Benjamin Tissoires Reviewed-by: David Herrmann Signed-off-by: Jiri Kosina --- drivers/hid/usbhid/hid-core.c | 15 +++++++++++++++ include/linux/hid.h | 19 +++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 420466bc481a..effcd3d6f5cf 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -1253,6 +1253,20 @@ static void usbhid_request(struct hid_device *hid, struct hid_report *rep, int r } } +static int usbhid_idle(struct hid_device *hid, int report, int idle, + int reqtype) +{ + struct usb_device *dev = hid_to_usb_dev(hid); + struct usb_interface *intf = to_usb_interface(hid->dev.parent); + struct usb_host_interface *interface = intf->cur_altsetting; + int ifnum = interface->desc.bInterfaceNumber; + + if (reqtype != HID_REQ_SET_IDLE) + return -EINVAL; + + return hid_set_idle(dev, ifnum, report, idle); +} + static struct hid_ll_driver usb_hid_driver = { .parse = usbhid_parse, .start = usbhid_start, @@ -1263,6 +1277,7 @@ static struct hid_ll_driver usb_hid_driver = { .hidinput_input_event = usb_hidinput_input_event, .request = usbhid_request, .wait = usbhid_wait_io, + .idle = usbhid_idle, }; static int usbhid_probe(struct usb_interface *intf, const struct usb_device_id *id) diff --git a/include/linux/hid.h b/include/linux/hid.h index 7071eb3d36c7..863744c38ddc 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -664,6 +664,7 @@ struct hid_driver { * shouldn't allocate anything to not leak memory * @request: send report request to device (e.g. feature report) * @wait: wait for buffered io to complete (send/recv reports) + * @idle: send idle request to device */ struct hid_ll_driver { int (*start)(struct hid_device *hdev); @@ -683,6 +684,7 @@ struct hid_ll_driver { struct hid_report *report, int reqtype); int (*wait)(struct hid_device *hdev); + int (*idle)(struct hid_device *hdev, int report, int idle, int reqtype); }; @@ -906,6 +908,23 @@ static inline void hid_hw_request(struct hid_device *hdev, hdev->ll_driver->request(hdev, report, reqtype); } +/** + * hid_hw_idle - send idle request to device + * + * @hdev: hid device + * @report: report to control + * @idle: idle state + * @reqtype: hid request type + */ +static inline int hid_hw_idle(struct hid_device *hdev, int report, int idle, + int reqtype) +{ + if (hdev->ll_driver->idle) + return hdev->ll_driver->idle(hdev, report, idle, reqtype); + + return 0; +} + /** * hid_hw_wait - wait for buffered io to complete * -- cgit From d2348fb6fdc6d671ad45b62db237f76c8c115603 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Sat, 2 Mar 2013 11:10:11 +0100 Subject: tick: Dynamically set broadcast irq affinity When a cpu goes to a deep idle state where its local timer is shut down, it notifies the time framework to use the broadcast timer instead. Unfortunately, the broadcast device could wake up any CPU, including an idle one which is not concerned by the wakeup at all. So in the worst case an idle CPU will wake up to send an IPI to the CPU whose timer expired. Provide an opt-in feature CLOCK_EVT_FEAT_DYNIRQ which tells the core that it should set the interrupt affinity of the broadcast interrupt to the cpu which has the earliest expiry time. This avoids unnecessary spurious wakeups and IPIs. 
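For a driver, opting in is a one-flag change plus a valid interrupt number; a hypothetical broadcast clockevent might be declared like this (all names and the IRQ value are illustrative, not from this patch):

    #define EXAMPLE_BC_IRQ 42	/* hypothetical, affinity-capable IRQ */

    static struct clock_event_device example_bc_dev = {
            .name		= "example-bc",
            /* DYNIRQ lets the core retarget .irq to the CPU with the
             * earliest expiry instead of waking an arbitrary CPU. */
            .features	= CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_DYNIRQ,
            .irq		= EXAMPLE_BC_IRQ,
            /* .rating, .set_mode, .set_next_event etc. omitted */
    };

Drivers that do not set the flag keep the old behavior, so nothing changes for existing hardware.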
[ tglx: Adopted to cpumask rework, silenced an uninitialized warning, massaged changelog ] Signed-off-by: Daniel Lezcano Cc: viresh.kumar@linaro.org Cc: jacob.jun.pan@linux.intel.com Cc: linux-arm-kernel@lists.infradead.org Cc: santosh.shilimkar@ti.com Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: rickard.andersson@stericsson.com Cc: vincent.guittot@linaro.org Cc: linus.walleij@stericsson.com Cc: john.stultz@linaro.org Link: http://lkml.kernel.org/r/1362219013-18173-3-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 5 +++++ kernel/time/tick-broadcast.c | 39 +++++++++++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 66346521cb65..494d33ea78f8 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -55,6 +55,11 @@ enum clock_event_nofitiers { #define CLOCK_EVT_FEAT_C3STOP 0x000008 #define CLOCK_EVT_FEAT_DUMMY 0x000010 +/* + * Core shall set the interrupt affinity dynamically in broadcast mode + */ +#define CLOCK_EVT_FEAT_DYNIRQ 0x000020 + /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 70dd98ce18d7..380910db7157 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -401,13 +401,34 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } -static int tick_broadcast_set_event(struct clock_event_device *bc, +/* + * Set broadcast interrupt affinity + */ +static void tick_broadcast_set_affinity(struct clock_event_device *bc, + const struct cpumask *cpumask) +{ + if (!(bc->features & CLOCK_EVT_FEAT_DYNIRQ)) + return; + + if (cpumask_equal(bc->cpumask, cpumask)) + return; + + bc->cpumask = cpumask; + irq_set_affinity(bc->irq, bc->cpumask); +} + +static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, ktime_t expires, int force) { + int ret; + if (bc->mode != CLOCK_EVT_MODE_ONESHOT) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - return clockevents_program_event(bc, expires, force); + ret = clockevents_program_event(bc, expires, force); + if (!ret) + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); + return ret; } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) @@ -436,7 +457,7 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) { struct tick_device *td; ktime_t now, next_event; - int cpu; + int cpu, next_cpu = 0; raw_spin_lock(&tick_broadcast_lock); again: @@ -447,10 +468,12 @@ again: /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { td = &per_cpu(tick_cpu_device, cpu); - if (td->evtdev->next_event.tv64 <= now.tv64) + if (td->evtdev->next_event.tv64 <= now.tv64) { cpumask_set_cpu(cpu, tmpmask); - else if (td->evtdev->next_event.tv64 < next_event.tv64) + } else if (td->evtdev->next_event.tv64 < next_event.tv64) { next_event.tv64 = td->evtdev->next_event.tv64; + next_cpu = cpu; + } } /* @@ -473,7 +496,7 @@ again: * Rearm the broadcast device. 
If event expired, * repeat the above */ - if (tick_broadcast_set_event(dev, next_event, 0)) + if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) goto again; } raw_spin_unlock(&tick_broadcast_lock); } @@ -515,7 +538,7 @@ void tick_broadcast_oneshot_control(unsigned long reason) if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); if (dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, dev->next_event, 1); + tick_broadcast_set_event(bc, cpu, dev->next_event, 1); } } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { @@ -581,7 +604,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; } else { -- cgit From 56dd9470d7c8734f055da2a6bac553caf4a468eb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 24 Feb 2013 00:23:25 +0100 Subject: context_tracking: Move exception handling to generic code Exception handling on context tracking should share common treatment: on entry we exit user mode if the exception triggered in that context. Then on exception exit we return to that previous context. Generalize this to avoid duplication across archs. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Kevin Hilman Cc: Mats Liljegren Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Namhyung Kim Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. McKenney --- arch/x86/include/asm/context_tracking.h | 21 --------------------- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/traps.c | 3 +-- arch/x86/mm/fault.c | 2 +- include/linux/context_tracking.h | 17 ++++++++++++++++- 5 files changed, 19 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h index 1616562683e9..1fe49704b146 100644 --- a/arch/x86/include/asm/context_tracking.h +++ b/arch/x86/include/asm/context_tracking.h @@ -1,31 +1,10 @@ #ifndef _ASM_X86_CONTEXT_TRACKING_H #define _ASM_X86_CONTEXT_TRACKING_H -#ifndef __ASSEMBLY__ -#include -#include - -static inline void exception_enter(struct pt_regs *regs) -{ - user_exit(); -} - -static inline void exception_exit(struct pt_regs *regs) -{ -#ifdef CONFIG_CONTEXT_TRACKING - if (user_mode(regs)) - user_enter(); -#endif -} - -#else /* __ASSEMBLY__ */ - #ifdef CONFIG_CONTEXT_TRACKING # define SCHEDULE_USER call schedule_user #else # define SCHEDULE_USER call schedule #endif -#endif /* !__ASSEMBLY__ */ - #endif diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index b686a904d7c3..e8bb0d61ecdc 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -20,6 +20,7 @@ * Authors: Anthony Liguori */ +#include #include #include #include @@ -43,7 +44,6 @@ #include #include #include -#include static int kvmapf = 1; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 68bda7a84159..ecc4ccbdd0cf 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -12,6 +12,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include @@ -55,8 +56,6 @@ #include #include #include -#include - #include #ifdef CONFIG_X86_64 diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 2b97525246d4..f946e6ce3315 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c 
@@ -13,12 +13,12 @@ #include /* perf_sw_event */ #include /* hstate_index_to_shift */ #include /* prefetchw */ +#include /* exception_enter(), ... */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_START */ -#include /* exception_enter(), ... */ /* * Page fault error code bits: diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index b28d161c1091..5a69273e93e6 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -1,10 +1,11 @@ #ifndef _LINUX_CONTEXT_TRACKING_H #define _LINUX_CONTEXT_TRACKING_H -#ifdef CONFIG_CONTEXT_TRACKING #include #include +#include +#ifdef CONFIG_CONTEXT_TRACKING struct context_tracking { /* * When active is false, probes are unset in order @@ -33,12 +34,26 @@ static inline bool context_tracking_active(void) extern void user_enter(void); extern void user_exit(void); + +static inline void exception_enter(struct pt_regs *regs) +{ + user_exit(); +} + +static inline void exception_exit(struct pt_regs *regs) +{ + if (user_mode(regs)) + user_enter(); +} + extern void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next); #else static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } static inline void user_exit(void) { } +static inline void exception_enter(struct pt_regs *regs) { } +static inline void exception_exit(struct pt_regs *regs) { } static inline void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_CONTEXT_TRACKING */ -- cgit From 6c1e0256fad84a843d915414e4b5973b7443d48d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 24 Feb 2013 01:19:14 +0100 Subject: context_tracking: Restore correct previous context state on exception exit On exception exit, we restore the previous context tracking state based on the regs of the interrupted frame. Iff that frame is in user mode as stated by the user_mode() helper, we restore the context tracking user mode. However, there is a tiny chunk of low-level arch code after we pass through user_enter() and until the CPU eventually resumes userspace. If an exception happens in this tiny area, exception_enter() correctly exits the context tracking user mode but exception_exit() won't restore it because of the value returned by user_mode(regs). As a result we may return to userspace with the wrong context tracking state. To fix this, change exception_enter() to return the context tracking state prior to its call and pass this saved state to exception_exit(). This restores the real context tracking state of the interrupted frame. (Maybe this patch was suggested to me, I don't recall exactly. If so, sorry for the missing credit.) Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Kevin Hilman Cc: Mats Liljegren Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Namhyung Kim Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. 
McKenney --- arch/x86/kernel/kvm.c | 6 ++-- arch/x86/kernel/traps.c | 65 +++++++++++++++++++++++++--------------- arch/x86/mm/fault.c | 6 ++-- include/linux/context_tracking.h | 19 +++++++----- 4 files changed, 61 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index e8bb0d61ecdc..cd6d9a5a42f6 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -254,16 +254,18 @@ EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason); dotraplinkage void __kprobes do_async_page_fault(struct pt_regs *regs, unsigned long error_code) { + enum ctx_state prev_state; + switch (kvm_read_and_reset_pf_reason()) { default: do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ - exception_enter(regs); + prev_state = exception_enter(); exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); - exception_exit(regs); + exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ecc4ccbdd0cf..ff6d2271cbe2 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -175,34 +175,38 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, #define DO_ERROR(trapnr, signr, str, name) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ - exception_enter(regs); \ + enum ctx_state prev_state; \ + \ + prev_state = exception_enter(); \ if (notify_die(DIE_TRAP, str, regs, error_code, \ trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(regs); \ + exception_exit(prev_state); \ return; \ } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, NULL); \ - exception_exit(regs); \ + exception_exit(prev_state); \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ + enum ctx_state prev_state; \ + \ info.si_signo = signr; \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - exception_enter(regs); \ + prev_state = exception_enter(); \ if (notify_die(DIE_TRAP, str, regs, error_code, \ trapnr, signr) == NOTIFY_STOP) { \ - exception_exit(regs); \ + exception_exit(prev_state); \ return; \ } \ conditional_sti(regs); \ do_trap(trapnr, signr, str, regs, error_code, &info); \ - exception_exit(regs); \ + exception_exit(prev_state); \ } DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, @@ -225,14 +229,16 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check, /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { - exception_enter(regs); + enum ctx_state prev_state; + + prev_state = exception_enter(); if (notify_die(DIE_TRAP, "stack segment", regs, error_code, X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { preempt_conditional_sti(regs); do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); preempt_conditional_cli(regs); } - exception_exit(regs); + exception_exit(prev_state); } dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -240,7 +246,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) static const char str[] = "double fault"; struct task_struct *tsk = current; - exception_enter(regs); + exception_enter(); /* Return not checked because double check cannot be ignored */ notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -260,8 +266,9 @@ dotraplinkage 
void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; + enum ctx_state prev_state; - exception_enter(regs); + prev_state = exception_enter(); conditional_sti(regs); #ifdef CONFIG_X86_32 @@ -299,12 +306,14 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); exit: - exception_exit(regs); + exception_exit(prev_state); } /* May run on IST stack. */ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_code) { + enum ctx_state prev_state; + #ifdef CONFIG_DYNAMIC_FTRACE /* * ftrace must be first, everything else may cause a recursive crash. @@ -314,7 +323,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif - exception_enter(regs); + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -335,7 +344,7 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: - exception_exit(regs); + exception_exit(prev_state); } #ifdef CONFIG_X86_64 @@ -392,11 +401,12 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; + enum ctx_state prev_state; int user_icebp = 0; unsigned long dr6; int si_code; - exception_enter(regs); + prev_state = exception_enter(); get_debugreg(dr6, 6); @@ -466,7 +476,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - exception_exit(regs); + exception_exit(prev_state); } /* @@ -560,17 +570,21 @@ void math_error(struct pt_regs *regs, int error_code, int trapnr) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { - exception_enter(regs); + enum ctx_state prev_state; + + prev_state = exception_enter(); math_error(regs, error_code, X86_TRAP_MF); - exception_exit(regs); + exception_exit(prev_state); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - exception_enter(regs); + enum ctx_state prev_state; + + prev_state = exception_enter(); math_error(regs, error_code, X86_TRAP_XF); - exception_exit(regs); + exception_exit(prev_state); } dotraplinkage void @@ -638,7 +652,9 @@ EXPORT_SYMBOL_GPL(math_state_restore); dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error_code) { - exception_enter(regs); + enum ctx_state prev_state; + + prev_state = exception_enter(); BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION @@ -649,7 +665,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - exception_exit(regs); + exception_exit(prev_state); return; } #endif @@ -657,15 +673,16 @@ do_device_not_available(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 conditional_sti(regs); #endif - exception_exit(regs); + exception_exit(prev_state); } #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; + enum ctx_state prev_state; - exception_enter(regs); + prev_state = exception_enter(); local_irq_enable(); info.si_signo = SIGILL; @@ -677,7 +694,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, &info); } - exception_exit(regs); + 
exception_exit(prev_state); } #endif diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f946e6ce3315..fa8c02de0d25 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1222,7 +1222,9 @@ good_area: dotraplinkage void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) { - exception_enter(regs); + enum ctx_state prev_state; + + prev_state = exception_enter(); __do_page_fault(regs, error_code); - exception_exit(regs); + exception_exit(prev_state); } diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 5a69273e93e6..365f4a61bf04 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -5,7 +5,6 @@ #include #include -#ifdef CONFIG_CONTEXT_TRACKING struct context_tracking { /* * When active is false, probes are unset in order @@ -14,12 +13,13 @@ struct context_tracking { * may be further optimized using static keys. */ bool active; - enum { + enum ctx_state { IN_KERNEL = 0, IN_USER, } state; }; +#ifdef CONFIG_CONTEXT_TRACKING DECLARE_PER_CPU(struct context_tracking, context_tracking); static inline bool context_tracking_in_user(void) @@ -35,14 +35,19 @@ static inline bool context_tracking_active(void) extern void user_enter(void); extern void user_exit(void); -static inline void exception_enter(struct pt_regs *regs) +static inline enum ctx_state exception_enter(void) { + enum ctx_state prev_ctx; + + prev_ctx = this_cpu_read(context_tracking.state); user_exit(); + + return prev_ctx; } -static inline void exception_exit(struct pt_regs *regs) +static inline void exception_exit(enum ctx_state prev_ctx) { - if (user_mode(regs)) + if (prev_ctx == IN_USER) user_enter(); } @@ -52,8 +57,8 @@ extern void context_tracking_task_switch(struct task_struct *prev, static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } static inline void user_exit(void) { } -static inline void exception_enter(struct pt_regs *regs) { } -static inline void exception_exit(struct pt_regs *regs) { } +static inline enum ctx_state exception_enter(void) { return 0; } +static inline void exception_exit(enum ctx_state prev_ctx) { } static inline void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_CONTEXT_TRACKING */ -- cgit From 9fbc42eac1f6917081dc3b39922b2f1c57fdff28 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 25 Feb 2013 17:25:39 +0100 Subject: cputime: Dynamically scale cputime for full dynticks accounting The full dynticks cputime accounting is able to account either using the tick or the context tracking subsystem. This way the housekeeping CPU can keep the low overhead tick based solution. This latter mode has a low jiffies resolution granularity and needs to be scaled against the CFS precise runtime accounting to improve its result. We already do this for CONFIG_TICK_CPU_ACCOUNTING; now we need to expand it to the full dynticks accounting dynamic off-case as well. Signed-off-by: Frederic Weisbecker Cc: Li Zhong Cc: Kevin Hilman Cc: Mats Liljegren Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Steven Rostedt Cc: Namhyung Kim Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. 
McKenney --- include/linux/sched.h | 4 +- kernel/fork.c | 2 +- kernel/sched/cputime.c | 154 +++++++++++++++++++++++++------------------------ 3 files changed, 83 insertions(+), 77 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..8d1b6034d80b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -570,7 +570,7 @@ struct signal_struct { cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; @@ -1327,7 +1327,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE struct cputime prev_cputime; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c9056..f3146ed49074 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1230,7 +1230,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = p->stime = p->gtime = 0; p->utimescaled = p->stimescaled = 0; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE p->prev_cputime.utime = p->prev_cputime.stime = 0; #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f4..024fe1998ad5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -388,82 +388,10 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ struct rq *rq) {} #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -/* - * Account a single tick of cpu time. - * @p: the process that the cpu time gets accounted to - * @user_tick: indicates if the tick is a user or a system tick - */ -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); - struct rq *rq = this_rq(); - - if (vtime_accounting_enabled()) - return; - - if (sched_clock_irqtime) { - irqtime_account_process_tick(p, user_tick, rq); - return; - } - - if (steal_account_process_tick()) - return; - - if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); - else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); - else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); -} - -/* - * Account multiple ticks of idle time. 
- * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) -{ - - if (sched_clock_irqtime) { - irqtime_account_idle_ticks(ticks); - return; - } - - account_idle_time(jiffies_to_cputime(ticks)); -} -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - /* * Use precise platform statistics if available: */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} #ifndef __ARCH_HAS_VTIME_TASK_SWITCH void vtime_task_switch(struct task_struct *prev) @@ -518,8 +446,80 @@ void vtime_account_irq_enter(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} -#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (vtime_accounting_enabled()) + return; + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (steal_account_process_tick()) + return; + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. 
+ * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total) { @@ -545,6 +545,12 @@ static void cputime_adjust(struct task_cputime *curr, { cputime_t rtime, stime, total; + if (vtime_accounting_enabled()) { + *ut = curr->utime; + *st = curr->stime; + return; + } + stime = curr->stime; total = stime + curr->utime; @@ -597,7 +603,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static unsigned long long vtime_delta(struct task_struct *tsk) -- cgit From 090096bf3db1c281ddd034573260045888a68fea Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Wed, 6 Mar 2013 15:39:42 +0000 Subject: net: generic fdb support for drivers without ndo_fdb_ If the driver does not support the ndo op, use the generic handler for it. This should work in the majority of cases. Eventually the fdb_dflt_add call gets translated into a __dev_set_rx_mode() call which should handle hardware support for filtering via the IFF_UNICAST_FLT flag. Namely, IFF_UNICAST_FLT indicates if the hardware can do unicast address filtering. If no support is available, the device is put into promisc mode. Signed-off-by: Vlad Yasevich Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 9 ++++++ net/core/rtnetlink.c | 81 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 84 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 489dd7bb28ec..f28544b2f9af 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -69,6 +69,15 @@ extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, int idx); +extern int ndo_dflt_fdb_add(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 flags); +extern int ndo_dflt_fdb_del(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr); extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b376410ff259..f95b6fbc29e9 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2048,6 +2048,38 @@ errout: rtnl_set_sk_err(net, RTNLGRP_NEIGH, err); } +/** + * ndo_dflt_fdb_add - default netdevice operation to add an FDB entry + */ +int ndo_dflt_fdb_add(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr, + u16 flags) +{ + int err = -EINVAL; + + /* If aging addresses are supported device will need to 
+ */ + if (ndm->ndm_state && !(ndm->ndm_state & NUD_PERMANENT)) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return err; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_add_excl(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_add_excl(dev, addr); + + /* Only return duplicate errors if NLM_F_EXCL is set */ + if (err == -EEXIST && !(flags & NLM_F_EXCL)) + err = 0; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_add); + static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct net *net = sock_net(skb->sk); @@ -2100,10 +2132,13 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } /* Embedded bridge, macvlan, and any other device support */ - if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_add) { - err = dev->netdev_ops->ndo_fdb_add(ndm, tb, - dev, addr, - nlh->nlmsg_flags); + if ((ndm->ndm_flags & NTF_SELF)) { + if (dev->netdev_ops->ndo_fdb_add) + err = dev->netdev_ops->ndo_fdb_add(ndm, tb, dev, addr, + nlh->nlmsg_flags); + else + err = ndo_dflt_fdb_add(ndm, tb, dev, addr, + nlh->nlmsg_flags); if (!err) { rtnl_fdb_notify(dev, addr, RTM_NEWNEIGH); @@ -2114,6 +2149,35 @@ out: return err; } +/** + * ndo_dflt_fdb_del - default netdevice operation to delete an FDB entry + */ +int ndo_dflt_fdb_del(struct ndmsg *ndm, + struct nlattr *tb[], + struct net_device *dev, + const unsigned char *addr) +{ + int err = -EOPNOTSUPP; + + /* If aging addresses are supported device will need to + * implement its own handler for this. + */ + if (ndm->ndm_state & NUD_PERMANENT) { + pr_info("%s: FDB only supports static addresses\n", dev->name); + return -EINVAL; + } + + if (is_unicast_ether_addr(addr) || is_link_local_ether_addr(addr)) + err = dev_uc_del(dev, addr); + else if (is_multicast_ether_addr(addr)) + err = dev_mc_del(dev, addr); + else + err = -EINVAL; + + return err; +} +EXPORT_SYMBOL(ndo_dflt_fdb_del); + static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) { struct net *net = sock_net(skb->sk); @@ -2171,8 +2235,11 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } /* Embedded bridge, macvlan, and any other device support */ - if ((ndm->ndm_flags & NTF_SELF) && dev->netdev_ops->ndo_fdb_del) { - err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + if (ndm->ndm_flags & NTF_SELF) { + if (dev->netdev_ops->ndo_fdb_del) + err = dev->netdev_ops->ndo_fdb_del(ndm, tb, dev, addr); + else + err = ndo_dflt_fdb_del(ndm, tb, dev, addr); if (!err) { rtnl_fdb_notify(dev, addr, RTM_DELNEIGH); @@ -2257,6 +2324,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) if (dev->netdev_ops->ndo_fdb_dump) idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, idx); + else + ndo_dflt_fdb_dump(skb, cb, dev, idx); } rcu_read_unlock(); -- cgit From c031e234ee304b507b79f76a7677ea0a7a8890e8 Mon Sep 17 00:00:00 2001 From: Andy King Date: Thu, 7 Mar 2013 05:26:13 +0000 Subject: VSOCK: Split vm_sockets.h into kernel/uapi Split the vSockets header into kernel and UAPI parts. The former gets the bits that used to be in __KERNEL__ guards, while the latter gets everything that is user-visible. Tested by compiling vsock (+transport) and a simple user-mode vSockets application. Reported-by: David Howells Acked-by: Dmitry Torokhov Signed-off-by: Andy King Acked-by: David Howells Signed-off-by: David S. 
Miller --- include/linux/vm_sockets.h | 23 +++++++++++++++++++++++ include/uapi/linux/vm_sockets.h | 23 ++++++++--------------- 2 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 include/linux/vm_sockets.h (limited to 'include/linux') diff --git a/include/linux/vm_sockets.h b/include/linux/vm_sockets.h new file mode 100644 index 000000000000..0805eecba8f7 --- /dev/null +++ b/include/linux/vm_sockets.h @@ -0,0 +1,23 @@ +/* + * VMware vSockets Driver + * + * Copyright (C) 2007-2013 VMware, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation version 2 and no later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _VM_SOCKETS_H +#define _VM_SOCKETS_H + +#include + +int vm_sockets_get_local_cid(void); + +#endif /* _VM_SOCKETS_H */ diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h index df91301847ec..b4ed5d895699 100644 --- a/include/uapi/linux/vm_sockets.h +++ b/include/uapi/linux/vm_sockets.h @@ -13,12 +13,10 @@ * more details. */ -#ifndef _VM_SOCKETS_H_ -#define _VM_SOCKETS_H_ +#ifndef _UAPI_VM_SOCKETS_H +#define _UAPI_VM_SOCKETS_H -#if !defined(__KERNEL__) -#include -#endif +#include /* Option name for STREAM socket buffer size. Use as the option name in * setsockopt(3) or getsockopt(3) to set or get an unsigned long long that @@ -137,14 +135,13 @@ #define VM_SOCKETS_VERSION_MINOR(_v) (((_v) & 0x0000FFFF)) /* Address structure for vSockets. The address family should be set to - * whatever vmci_sock_get_af_value_fd() returns. The structure members should - * all align on their natural boundaries without resorting to compiler packing - * directives. The total size of this structure should be exactly the same as - * that of struct sockaddr. + * AF_VSOCK. The structure members should all align on their natural + * boundaries without resorting to compiler packing directives. The total size + * of this structure should be exactly the same as that of struct sockaddr. */ struct sockaddr_vm { - sa_family_t svm_family; + __kernel_sa_family_t svm_family; unsigned short svm_reserved1; unsigned int svm_port; unsigned int svm_cid; @@ -156,8 +153,4 @@ struct sockaddr_vm { #define IOCTL_VM_SOCKETS_GET_LOCAL_CID _IO(7, 0xb9) -#if defined(__KERNEL__) -int vm_sockets_get_local_cid(void); -#endif - -#endif +#endif /* _UAPI_VM_SOCKETS_H */ -- cgit From ec5f061564238892005257c83565a0b58ec79295 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 7 Mar 2013 09:28:01 +0000 Subject: net: Kill link between CSUM and SG features. Earlier, SG was unset if CSUM was not available for a given device, forcing an skb copy to avoid sending an inconsistent csum. Commit c9af6db4c11c (net: Fix possible wrong checksum generation) added an explicit flag to force the copy to fix this issue. Therefore there is no need to link SG and CSUM; the following patch kills this link between these two features. This patch is also required by the following patch in the series. Signed-off-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 13 ++++++++++ net/core/dev.c | 63 +++++++++++++++++++++++++---------------------- net/core/skbuff.c | 13 ++++++++++ net/ipv4/af_inet.c | 3 --- net/ipv6/ip6_offload.c | 3 --- 5 files changed, 59 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 896eb4985f97..e1ebeffa6b35 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2683,6 +2683,19 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, netdev_features_t features) { return __skb_gso_segment(skb, features, true); } +__be16 skb_network_protocol(struct sk_buff *skb); + +static inline bool can_checksum_protocol(netdev_features_t features, + __be16 protocol) +{ + return ((features & NETIF_F_GEN_CSUM) || + ((features & NETIF_F_V4_CSUM) && + protocol == htons(ETH_P_IP)) || + ((features & NETIF_F_V6_CSUM) && + protocol == htons(ETH_P_IPV6)) || + ((features & NETIF_F_FCOE_CRC) && + protocol == htons(ETH_P_FCOE))); +} #ifdef CONFIG_BUG extern void netdev_rx_csum_fault(struct net_device *dev); diff --git a/net/core/dev.c b/net/core/dev.c index 96103894ad69..bb999931729f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2208,16 +2208,8 @@ out: } EXPORT_SYMBOL(skb_checksum_help); -/** - * skb_mac_gso_segment - mac layer segmentation handler. - * @skb: buffer to segment - * @features: features for the output path (see dev->features) - */ -struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, - netdev_features_t features) +__be16 skb_network_protocol(struct sk_buff *skb) { - struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); - struct packet_offload *ptype; __be16 type = skb->protocol; while (type == htons(ETH_P_8021Q)) { @@ -2225,13 +2217,31 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, struct vlan_hdr *vh; if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN))) - return ERR_PTR(-EINVAL); + return 0; vh = (struct vlan_hdr *)(skb->data + vlan_depth); type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } + return type; +} + +/** + * skb_mac_gso_segment - mac layer segmentation handler. + * @skb: buffer to segment + * @features: features for the output path (see dev->features) + */ +struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT); + struct packet_offload *ptype; + __be16 type = skb_network_protocol(skb); + + if (unlikely(!type)) + return ERR_PTR(-EINVAL); + __skb_pull(skb, skb->mac_len); rcu_read_lock(); @@ -2398,24 +2408,12 @@ static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features) return 0; } -static bool can_checksum_protocol(netdev_features_t features, __be16 protocol) -{ - return ((features & NETIF_F_GEN_CSUM) || - ((features & NETIF_F_V4_CSUM) && - protocol == htons(ETH_P_IP)) || - ((features & NETIF_F_V6_CSUM) && - protocol == htons(ETH_P_IPV6)) || - ((features & NETIF_F_FCOE_CRC) && - protocol == htons(ETH_P_FCOE))); -} - static netdev_features_t harmonize_features(struct sk_buff *skb, __be16 protocol, netdev_features_t features) { if (skb->ip_summed != CHECKSUM_NONE && !can_checksum_protocol(features, protocol)) { features &= ~NETIF_F_ALL_CSUM; - features &= ~NETIF_F_SG; } else if (illegal_highdma(skb->dev, skb)) { features &= ~NETIF_F_SG; } @@ -4921,20 +4919,25 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM); } - /* Fix illegal SG+CSUM combinations. 
*/ - if ((features & NETIF_F_SG) && - !(features & NETIF_F_ALL_CSUM)) { - netdev_dbg(dev, - "Dropping NETIF_F_SG since no checksum feature.\n"); - features &= ~NETIF_F_SG; - } - /* TSO requires that SG is present as well. */ if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) { netdev_dbg(dev, "Dropping TSO features since no SG feature.\n"); features &= ~NETIF_F_ALL_TSO; } + if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IP_CSUM)) { + netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO; + features &= ~NETIF_F_TSO_ECN; + } + + if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) && + !(features & NETIF_F_IPV6_CSUM)) { + netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n"); + features &= ~NETIF_F_TSO6; + } + /* TSO ECN requires that TSO is present as well. */ if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN) features &= ~NETIF_F_TSO_ECN; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 33245ef54c3b..0a48ae20c903 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2741,12 +2741,19 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int headroom; unsigned int len; + __be16 proto; + bool csum; int sg = !!(features & NETIF_F_SG); int nfrags = skb_shinfo(skb)->nr_frags; int err = -ENOMEM; int i = 0; int pos; + proto = skb_network_protocol(skb); + if (unlikely(!proto)) + return ERR_PTR(-EINVAL); + + csum = !!can_checksum_protocol(features, proto); __skb_push(skb, doffset); headroom = skb_headroom(skb); pos = skb_headlen(skb); @@ -2884,6 +2891,12 @@ skip_fraglist: nskb->data_len = len - hsize; nskb->len += nskb->data_len; nskb->truesize += nskb->data_len; + + if (!csum) { + nskb->csum = skb_checksum(nskb, doffset, + nskb->len - doffset, 0); + nskb->ip_summed = CHECKSUM_NONE; + } } while ((offset += len) < skb->len); return segs; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68f6a94f7661..dc3f677360a5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1284,9 +1284,6 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int id; unsigned int offset = 0; - if (!(features & NETIF_F_V4_CSUM)) - features &= ~NETIF_F_SG; - if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | SKB_GSO_UDP | diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 8234c1dcdf72..7a0d25a5479c 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -92,9 +92,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, u8 *prevhdr; int offset = 0; - if (!(features & NETIF_F_V6_CSUM)) - features &= ~NETIF_F_SG; - if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | -- cgit From aefbd2b3c2a9c657605e4337f9919d6c6273e8e6 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 7 Mar 2013 13:21:46 +0000 Subject: tunneling: Capture inner mac header during encapsulation. This patch adds inner mac header. This will be used in next patch to find tunner header length. Header len is required to copy tunnel header to each gso segment. This patch does not change any functionality. Signed-off-by: Pravin B Shelar Acked-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 34 ++++++++++++++++++++++++++++++++++ net/core/skbuff.c | 2 ++ 2 files changed, 36 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 821c7f45d2a7..d7f96ff68f77 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -387,6 +387,7 @@ typedef unsigned char *sk_buff_data_t; * @vlan_tci: vlan tag control information * @inner_transport_header: Inner transport layer header (encapsulation) * @inner_network_header: Network layer header (encapsulation) + * @inner_mac_header: Link layer header (encapsulation) * @transport_header: Transport layer header * @network_header: Network layer header * @mac_header: Link layer header @@ -505,6 +506,7 @@ struct sk_buff { sk_buff_data_t inner_transport_header; sk_buff_data_t inner_network_header; + sk_buff_data_t inner_mac_header; sk_buff_data_t transport_header; sk_buff_data_t network_header; sk_buff_data_t mac_header; @@ -1466,6 +1468,7 @@ static inline void skb_reserve(struct sk_buff *skb, int len) static inline void skb_reset_inner_headers(struct sk_buff *skb) { + skb->inner_mac_header = skb->mac_header; skb->inner_network_header = skb->network_header; skb->inner_transport_header = skb->transport_header; } @@ -1511,6 +1514,22 @@ static inline void skb_set_inner_network_header(struct sk_buff *skb, skb->inner_network_header += offset; } +static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) +{ + return skb->head + skb->inner_mac_header; +} + +static inline void skb_reset_inner_mac_header(struct sk_buff *skb) +{ + skb->inner_mac_header = skb->data - skb->head; +} + +static inline void skb_set_inner_mac_header(struct sk_buff *skb, + const int offset) +{ + skb_reset_inner_mac_header(skb); + skb->inner_mac_header += offset; +} static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != ~0U; @@ -1604,6 +1623,21 @@ static inline void skb_set_inner_network_header(struct sk_buff *skb, skb->inner_network_header = skb->data + offset; } +static inline unsigned char *skb_inner_mac_header(const struct sk_buff *skb) +{ + return skb->inner_mac_header; +} + +static inline void skb_reset_inner_mac_header(struct sk_buff *skb) +{ + skb->inner_mac_header = skb->data; +} + +static inline void skb_set_inner_mac_header(struct sk_buff *skb, + const int offset) +{ + skb->inner_mac_header = skb->data + offset; +} static inline bool skb_transport_header_was_set(const struct sk_buff *skb) { return skb->transport_header != NULL; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 0278c7f787bf..31c6737d3189 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -673,6 +673,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) new->mac_header = old->mac_header; new->inner_transport_header = old->inner_transport_header; new->inner_network_header = old->inner_network_header; + new->inner_mac_header = old->inner_mac_header; skb_dst_copy(new, old); new->rxhash = old->rxhash; new->ooo_okay = old->ooo_okay; @@ -876,6 +877,7 @@ static void skb_headers_offset_update(struct sk_buff *skb, int off) skb->mac_header += off; skb->inner_transport_header += off; skb->inner_network_header += off; + skb->inner_mac_header += off; } static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) -- cgit From 731362674580cb0c696cd1b1a03d8461a10cf90a Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Thu, 7 Mar 2013 13:21:51 +0000 Subject: tunneling: Add generic Tunnel segmentation. 
Adds generic tunneling offloading support for IPv4-UDP based tunnels. GSO type is added to request this offload for a skb. netdev feature NETIF_F_UDP_TUNNEL is added for hardware offloaded udp-tunnel support. Currently no device supports this feature, software offload is used. This can be used by tunneling protocols like VXLAN. CC: Jesse Gross Signed-off-by: Pravin B Shelar Acked-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/netdev_features.h | 7 +-- include/linux/skbuff.h | 2 + net/core/ethtool.c | 1 + net/ipv4/af_inet.c | 6 ++- net/ipv4/tcp.c | 1 + net/ipv4/udp.c | 115 +++++++++++++++++++++++++++++++--------- net/ipv6/ip6_offload.c | 1 + net/ipv6/udp_offload.c | 8 ++- 8 files changed, 111 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 3dd39340430e..f5e797c0c2a4 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -42,9 +42,9 @@ enum { NETIF_F_TSO6_BIT, /* ... TCPv6 segmentation */ NETIF_F_FSO_BIT, /* ... FCoE segmentation */ NETIF_F_GSO_GRE_BIT, /* ... GRE with TSO */ - /**/NETIF_F_GSO_LAST, /* [can't be last bit, see GSO_MASK] */ - NETIF_F_GSO_RESERVED2 /* ... free (fill GSO_MASK to 8 bits) */ - = NETIF_F_GSO_LAST, + NETIF_F_GSO_UDP_TUNNEL_BIT, /* ... UDP TUNNEL with TSO */ + /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ + NETIF_F_GSO_UDP_TUNNEL_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CSUM_BIT, /* SCTP checksum offload */ @@ -103,6 +103,7 @@ enum { #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) #define NETIF_F_GRE_GSO __NETIF_F(GSO_GRE) +#define NETIF_F_UDP_TUNNEL __NETIF_F(UDP_TUNNEL) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d7f96ff68f77..eb2106fe3bb4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -316,6 +316,8 @@ enum { SKB_GSO_FCOE = 1 << 5, SKB_GSO_GRE = 1 << 6, + + SKB_GSO_UDP_TUNNEL = 1 << 7, }; #if BITS_PER_LONG > 32 diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3e9b2c3e30f0..adc1351e6873 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -78,6 +78,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation", [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation", [NETIF_F_GSO_GRE_BIT] = "tx-gre-segmentation", + [NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp", diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index dc3f677360a5..9e5882caf8a7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1283,6 +1283,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, int ihl; int id; unsigned int offset = 0; + bool tunnel; if (unlikely(skb_shinfo(skb)->gso_type & ~(SKB_GSO_TCPV4 | @@ -1290,6 +1291,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_UDP_TUNNEL | 0))) goto out; @@ -1304,6 +1306,8 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, if (unlikely(!pskb_may_pull(skb, ihl))) goto out; + tunnel = !!skb->encapsulation; + __skb_pull(skb, ihl); skb_reset_transport_header(skb); iph = ip_hdr(skb); @@ -1323,7 +1327,7 @@ static struct sk_buff *inet_gso_segment(struct sk_buff *skb, skb = segs; do { iph = ip_hdr(skb); - if (proto == IPPROTO_UDP) { + if 
(!tunnel && proto == IPPROTO_UDP) { iph->id = htons(id); iph->frag_off = htons(offset >> 3); if (skb->next != NULL) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 47e854fcae24..8d14573ade77 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3044,6 +3044,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, SKB_GSO_TCP_ECN | SKB_GSO_TCPV6 | SKB_GSO_GRE | + SKB_GSO_UDP_TUNNEL | 0) || !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 265c42cf963c..41760e043bf5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2272,31 +2272,88 @@ void __init udp_init(void) int udp4_ufo_send_check(struct sk_buff *skb) { - const struct iphdr *iph; - struct udphdr *uh; - - if (!pskb_may_pull(skb, sizeof(*uh))) + if (!pskb_may_pull(skb, sizeof(struct udphdr))) return -EINVAL; - iph = ip_hdr(skb); - uh = udp_hdr(skb); + if (likely(!skb->encapsulation)) { + const struct iphdr *iph; + struct udphdr *uh; - uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, - IPPROTO_UDP, 0); - skb->csum_start = skb_transport_header(skb) - skb->head; - skb->csum_offset = offsetof(struct udphdr, check); - skb->ip_summed = CHECKSUM_PARTIAL; + iph = ip_hdr(skb); + uh = udp_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, skb->len, + IPPROTO_UDP, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct udphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + } return 0; } +static struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, + netdev_features_t features) +{ + struct sk_buff *segs = ERR_PTR(-EINVAL); + int mac_len = skb->mac_len; + int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb); + int outer_hlen; + netdev_features_t enc_features; + + if (unlikely(!pskb_may_pull(skb, tnl_hlen))) + goto out; + + skb->encapsulation = 0; + __skb_pull(skb, tnl_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, skb_inner_network_offset(skb)); + skb->mac_len = skb_inner_network_offset(skb); + + /* segment inner packet. */ + enc_features = skb->dev->hw_enc_features & netif_skb_features(skb); + segs = skb_mac_gso_segment(skb, enc_features); + if (!segs || IS_ERR(segs)) + goto out; + + outer_hlen = skb_tnl_header_len(skb); + skb = segs; + do { + struct udphdr *uh; + int udp_offset = outer_hlen - tnl_hlen; + + skb->mac_len = mac_len; + + skb_push(skb, outer_hlen); + skb_reset_mac_header(skb); + skb_set_network_header(skb, mac_len); + skb_set_transport_header(skb, udp_offset); + uh = udp_hdr(skb); + uh->len = htons(skb->len - udp_offset); + + /* csum segment if tunnel sets skb with csum. 
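+ * (Editor's note, not in the original patch: each segment is shorter
+ * than the original skb, so the outer UDP checksum cannot be reused
+ * and is recomputed in full here; this is the software path, since no
+ * hardware yet offloads the outer checksum of tunnel-segmented
+ * packets.)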
*/ + if (unlikely(uh->check)) { + struct iphdr *iph = ip_hdr(skb); + + uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + skb->len - udp_offset, + IPPROTO_UDP, 0); + uh->check = csum_fold(skb_checksum(skb, udp_offset, + skb->len - udp_offset, 0)); + if (uh->check == 0) + uh->check = CSUM_MANGLED_0; + + } + skb->ip_summed = CHECKSUM_NONE; + } while ((skb = skb->next)); +out: + return segs; +} + struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, netdev_features_t features) { struct sk_buff *segs = ERR_PTR(-EINVAL); unsigned int mss; - int offset; - __wsum csum; - mss = skb_shinfo(skb)->gso_size; if (unlikely(skb->len <= mss)) goto out; @@ -2306,6 +2363,7 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, int type = skb_shinfo(skb)->gso_type; if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | SKB_GSO_GRE) || !(type & (SKB_GSO_UDP)))) goto out; @@ -2316,20 +2374,27 @@ struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, goto out; } - /* Do software UFO. Complete and fill in the UDP checksum as HW cannot - * do checksum of UDP packets sent as multiple IP fragments. - */ - offset = skb_checksum_start_offset(skb); - csum = skb_checksum(skb, offset, skb->len - offset, 0); - offset += skb->csum_offset; - *(__sum16 *)(skb->data + offset) = csum_fold(csum); - skb->ip_summed = CHECKSUM_NONE; - /* Fragment the skb. IP headers of the fragments are updated in * inet_gso_segment() */ - segs = skb_segment(skb, features); + if (skb->encapsulation && skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL) + segs = skb_udp_tunnel_segment(skb, features); + else { + int offset; + __wsum csum; + + /* Do software UFO. Complete and fill in the UDP checksum as + * HW cannot do checksum of UDP packets sent as multiple + * IP fragments. + */ + offset = skb_checksum_start_offset(skb); + csum = skb_checksum(skb, offset, skb->len - offset, 0); + offset += skb->csum_offset; + *(__sum16 *)(skb->data + offset) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + + segs = skb_segment(skb, features); + } out: return segs; } - diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 7a0d25a5479c..71b766ee821d 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -97,6 +97,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, SKB_GSO_DODGY | SKB_GSO_TCP_ECN | SKB_GSO_GRE | + SKB_GSO_UDP_TUNNEL | SKB_GSO_TCPV6 | 0))) goto out; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index cf05cf073c51..3bb3a891a424 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -21,6 +21,10 @@ static int udp6_ufo_send_check(struct sk_buff *skb) const struct ipv6hdr *ipv6h; struct udphdr *uh; + /* UDP Tunnel offload on ipv6 is not yet supported. */ + if (skb->encapsulation) + return -EINVAL; + if (!pskb_may_pull(skb, sizeof(*uh))) return -EINVAL; @@ -56,7 +60,9 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, /* Packet is from an untrusted source, reset gso_segs. 
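 * (Editor's note, not part of the quoted source: "untrusted" means the
 * skb was flagged SKB_GSO_DODGY, e.g. it came from a guest or raw
 * socket, so the advertised segment count is not trusted and gso_segs
 * is recomputed from gso_size.)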
*/ int type = skb_shinfo(skb)->gso_type; - if (unlikely(type & ~(SKB_GSO_UDP | SKB_GSO_DODGY | + if (unlikely(type & ~(SKB_GSO_UDP | + SKB_GSO_DODGY | + SKB_GSO_UDP_TUNNEL | SKB_GSO_GRE) || !(type & (SKB_GSO_UDP)))) goto out; -- cgit From 6150f3bc0b4f94f0eea3e32b4e7462025e4bd972 Mon Sep 17 00:00:00 2001 From: Nicolas Royer Date: Wed, 20 Feb 2013 17:10:23 +0100 Subject: ARM: AT91SAM9G45: same platform data structure for all crypto peripherals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only AES use DMA in AT91SAM9G45 (TDES and SHA use PDC). However latest Atmel TDES and SHA IP releases use DMA instead of PDC. --> Atmel TDES and SHA drivers need DMA platform data for those IP releases. Goal of this patch is to use the same platform data structure for all Atmel crypto peripherals. This structure contains information about DMA interface. Signed-off-by: Nicolas Royer Acked-by: Nicolas Ferre Acked-by: Eric Bénard Tested-by: Eric Bénard Signed-off-by: Herbert Xu --- arch/arm/mach-at91/at91sam9g45_devices.c | 14 ++++++-------- include/linux/platform_data/atmel-aes.h | 22 ---------------------- include/linux/platform_data/crypto-atmel.h | 22 ++++++++++++++++++++++ 3 files changed, 28 insertions(+), 30 deletions(-) delete mode 100644 include/linux/platform_data/atmel-aes.h create mode 100644 include/linux/platform_data/crypto-atmel.h (limited to 'include/linux') diff --git a/arch/arm/mach-at91/at91sam9g45_devices.c b/arch/arm/mach-at91/at91sam9g45_devices.c index 827c9f2a70fb..f0bf68268ca2 100644 --- a/arch/arm/mach-at91/at91sam9g45_devices.c +++ b/arch/arm/mach-at91/at91sam9g45_devices.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include @@ -1900,7 +1900,8 @@ static void __init at91_add_device_tdes(void) {} * -------------------------------------------------------------------- */ #if defined(CONFIG_CRYPTO_DEV_ATMEL_AES) || defined(CONFIG_CRYPTO_DEV_ATMEL_AES_MODULE) -static struct aes_platform_data aes_data; +static struct crypto_platform_data aes_data; +static struct crypto_dma_data alt_atslave; static u64 aes_dmamask = DMA_BIT_MASK(32); static struct resource aes_resources[] = { @@ -1931,23 +1932,20 @@ static struct platform_device at91sam9g45_aes_device = { static void __init at91_add_device_aes(void) { struct at_dma_slave *atslave; - struct aes_dma_data *alt_atslave; - - alt_atslave = kzalloc(sizeof(struct aes_dma_data), GFP_KERNEL); /* DMA TX slave channel configuration */ - atslave = &alt_atslave->txdata; + atslave = &alt_atslave.txdata; atslave->dma_dev = &at_hdmac_device.dev; atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_SRC_H2SEL_HW | ATC_SRC_PER(AT_DMA_ID_AES_RX); /* DMA RX slave channel configuration */ - atslave = &alt_atslave->rxdata; + atslave = &alt_atslave.rxdata; atslave->dma_dev = &at_hdmac_device.dev; atslave->cfg = ATC_FIFOCFG_ENOUGHSPACE | ATC_DST_H2SEL_HW | ATC_DST_PER(AT_DMA_ID_AES_TX); - aes_data.dma_slave = alt_atslave; + aes_data.dma_slave = &alt_atslave; platform_device_register(&at91sam9g45_aes_device); } #else diff --git a/include/linux/platform_data/atmel-aes.h b/include/linux/platform_data/atmel-aes.h deleted file mode 100644 index ab68082fbcb0..000000000000 --- a/include/linux/platform_data/atmel-aes.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef __LINUX_ATMEL_AES_H -#define __LINUX_ATMEL_AES_H - -#include - -/** - * struct aes_dma_data - DMA data for AES - */ -struct aes_dma_data { - struct at_dma_slave txdata; - struct at_dma_slave rxdata; -}; - -/** - * struct aes_platform_data - board-specific AES 
configuration - * @dma_slave: DMA slave interface to use in data transfers. - */ -struct aes_platform_data { - struct aes_dma_data *dma_slave; -}; - -#endif /* __LINUX_ATMEL_AES_H */ diff --git a/include/linux/platform_data/crypto-atmel.h b/include/linux/platform_data/crypto-atmel.h new file mode 100644 index 000000000000..b46e0d9062a0 --- /dev/null +++ b/include/linux/platform_data/crypto-atmel.h @@ -0,0 +1,22 @@ +#ifndef __LINUX_CRYPTO_ATMEL_H +#define __LINUX_CRYPTO_ATMEL_H + +#include + +/** + * struct crypto_dma_data - DMA data for AES/TDES/SHA + */ +struct crypto_dma_data { + struct at_dma_slave txdata; + struct at_dma_slave rxdata; +}; + +/** + * struct crypto_platform_data - board-specific AES/TDES/SHA configuration + * @dma_slave: DMA slave interface to use in data transfers. + */ +struct crypto_platform_data { + struct crypto_dma_data *dma_slave; +}; + +#endif /* __LINUX_CRYPTO_ATMEL_H */ -- cgit From e61667af2f77d481411f2ccd307fed2247d785a8 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sun, 10 Mar 2013 05:18:39 +0000 Subject: tcp: Remove unused tw_cookie_values from tcp_timewait_sock tw_cookie_values is never used in the TCP-stack. It was added by 435cf559f (TCPCT part 1d: define TCP cookie option, extend existing struct's), but already at that time it was not used at all, nor mentioned in the commit-message. Signed-off-by: Christoph Paasch Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/tcp.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f28408c07dc2..515c3746b675 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -361,10 +361,6 @@ struct tcp_timewait_sock { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif - /* Few sockets in timewait have cookies; in that case, then this - * object holds a reference to them (tw_cookie_values->kref). - */ - struct tcp_cookie_values *tw_cookie_values; }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) -- cgit From 26fd76cab2e61cedc5c25f7151fb31b57ddc53c7 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Fri, 22 Feb 2013 10:53:25 +0100 Subject: NFC: llcp: Implement socket options Some LLCP services (e.g. the validation ones) require some control over the LLCP link parameters like the receive window (RW) or the MIU extension (MIUX). This can only be done through socket options. 
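As an illustration only (not part of the patch), a user-space sketch of
setting the receive window; SOL_NFC and NFC_LLCP_RW are the values this
patch introduces, everything else is assumed boilerplate:

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/socket.h>

	#define SOL_NFC		280	/* from include/linux/socket.h below */
	#define NFC_LLCP_RW	0	/* from include/uapi/linux/nfc.h below */

	/* sk is assumed to be an already-created AF_NFC LLCP socket. The
	 * option takes a u32 and is rejected with -EINVAL once the socket
	 * is bound, listening or connected, so set it before connect(). */
	static int set_llcp_rw(int sk, uint32_t rw)
	{
		if (setsockopt(sk, SOL_NFC, NFC_LLCP_RW, &rw, sizeof(rw)) < 0) {
			perror("setsockopt(NFC_LLCP_RW)");
			return -1;
		}
		return 0;
	}
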
Signed-off-by: Samuel Ortiz --- include/linux/socket.h | 1 + include/uapi/linux/nfc.h | 4 ++ net/nfc/llcp/llcp.h | 3 ++ net/nfc/llcp/sock.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 125 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/socket.h b/include/linux/socket.h index 2b9f74b0ffea..428c37a1f95c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -298,6 +298,7 @@ struct ucred { #define SOL_IUCV 277 #define SOL_CAIF 278 #define SOL_ALG 279 +#define SOL_NFC 280 /* IPX options */ #define IPX_TYPE 1 diff --git a/include/uapi/linux/nfc.h b/include/uapi/linux/nfc.h index 7969f46f1bb3..855630fe731d 100644 --- a/include/uapi/linux/nfc.h +++ b/include/uapi/linux/nfc.h @@ -220,4 +220,8 @@ struct sockaddr_nfc_llcp { #define NFC_LLCP_DIRECTION_RX 0x00 #define NFC_LLCP_DIRECTION_TX 0x01 +/* socket option names */ +#define NFC_LLCP_RW 0 +#define NFC_LLCP_MIUX 1 + #endif /*__LINUX_NFC_H */ diff --git a/net/nfc/llcp/llcp.h b/net/nfc/llcp/llcp.h index 32cec81939e6..5f117adac2e5 100644 --- a/net/nfc/llcp/llcp.h +++ b/net/nfc/llcp/llcp.h @@ -104,6 +104,9 @@ struct nfc_llcp_sock { u8 dsap; char *service_name; size_t service_name_len; + u8 rw; + u16 miux; + /* Remote link parameters */ u8 remote_rw; diff --git a/net/nfc/llcp/sock.c b/net/nfc/llcp/sock.c index cc564992ba95..9357a756f7a9 100644 --- a/net/nfc/llcp/sock.c +++ b/net/nfc/llcp/sock.c @@ -223,6 +223,121 @@ error: return ret; } +static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, unsigned int optlen) +{ + struct sock *sk = sock->sk; + struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); + u32 opt; + int err = 0; + + pr_debug("%p optname %d\n", sk, optname); + + if (level != SOL_NFC) + return -ENOPROTOOPT; + + lock_sock(sk); + + switch (optname) { + case NFC_LLCP_RW: + if (sk->sk_state == LLCP_CONNECTED || + sk->sk_state == LLCP_BOUND || + sk->sk_state == LLCP_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt > LLCP_MAX_RW) { + err = -EINVAL; + break; + } + + llcp_sock->rw = (u8) opt; + + break; + + case NFC_LLCP_MIUX: + if (sk->sk_state == LLCP_CONNECTED || + sk->sk_state == LLCP_BOUND || + sk->sk_state == LLCP_LISTEN) { + err = -EINVAL; + break; + } + + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + if (opt > LLCP_MAX_MIUX) { + err = -EINVAL; + break; + } + + llcp_sock->miux = (u16) opt; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + + return err; +} + +static int nfc_llcp_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); + int len, err = 0; + + pr_debug("%p optname %d\n", sk, optname); + + if (level != SOL_NFC) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(u32, len, sizeof(u32)); + + lock_sock(sk); + + switch (optname) { + case NFC_LLCP_RW: + if (put_user(llcp_sock->rw, (u32 __user *) optval)) + err = -EFAULT; + + break; + + case NFC_LLCP_MIUX: + if (put_user(llcp_sock->miux, (u32 __user *) optval)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + + if (put_user(len, optlen)) + return -EFAULT; + + return err; +} + void nfc_llcp_accept_unlink(struct sock *sk) { struct nfc_llcp_sock *llcp_sock = nfc_llcp_sock(sk); @@ -735,8 +850,8 @@ static const 
struct proto_ops llcp_sock_ops = { .ioctl = sock_no_ioctl, .listen = llcp_sock_listen, .shutdown = sock_no_shutdown, - .setsockopt = sock_no_setsockopt, - .getsockopt = sock_no_getsockopt, + .setsockopt = nfc_llcp_setsockopt, + .getsockopt = nfc_llcp_getsockopt, .sendmsg = llcp_sock_sendmsg, .recvmsg = llcp_sock_recvmsg, .mmap = sock_no_mmap, -- cgit From 3a08a8f9f0936e182d387afd85fdc5d303381521 Mon Sep 17 00:00:00 2001 From: Raghavendra K T Date: Mon, 4 Mar 2013 23:32:07 +0530 Subject: kvm: Record the preemption status of vcpus using preempt notifiers Note that we mark as preempted only when vcpu's task state was Running during preemption. Thanks Jiannan, Avi for preemption notifier ideas. Thanks Gleb, PeterZ for their precious suggestions. Thanks Srikar for an idea on avoiding rcu lock while checking task state that improved overcommit numbers. Reviewed-by: Chegu Vinod Reviewed-by: Marcelo Tosatti Signed-off-by: Raghavendra K T Signed-off-by: Gleb Natapov --- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 5 +++++ 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 9fa13ebc3381..0f4941a9c9c8 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -253,6 +253,7 @@ struct kvm_vcpu { bool dy_eligible; } spin_loop; #endif + bool preempted; struct kvm_vcpu_arch arch; }; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index faf05bddd131..470f2bc8205a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -244,6 +244,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) kvm_vcpu_set_in_spin_loop(vcpu, false); kvm_vcpu_set_dy_eligible(vcpu, false); + vcpu->preempted = false; r = kvm_arch_vcpu_init(vcpu); if (r < 0) @@ -2880,6 +2881,8 @@ struct kvm_vcpu *preempt_notifier_to_vcpu(struct preempt_notifier *pn) static void kvm_sched_in(struct preempt_notifier *pn, int cpu) { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + if (vcpu->preempted) + vcpu->preempted = false; kvm_arch_vcpu_load(vcpu, cpu); } @@ -2889,6 +2892,8 @@ static void kvm_sched_out(struct preempt_notifier *pn, { struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn); + if (current->state == TASK_RUNNING) + vcpu->preempted = true; kvm_arch_vcpu_put(vcpu); } -- cgit From e0c25362384f4be9c755c98560cd4b1cdb2ec79c Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Sun, 10 Mar 2013 21:52:53 -0500 Subject: clocksource: add empty version of clocksource_of_init Add an empty clocksource_of_init when !CLKSRC_OF. This is needed for builds where no timer has selected CLKSRC_OF. Signed-off-by: Rob Herring Cc: John Stultz Cc: Thomas Gleixner --- include/linux/clocksource.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 27cfda427dd9..08ed5e19d8c6 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -340,6 +340,7 @@ extern void clocksource_of_init(void); __used __section(__clksrc_of_table) \ = { .compatible = compat, .data = fn }; #else +static inline void clocksource_of_init(void) {} #define CLOCKSOURCE_OF_DECLARE(name, compat, fn) #endif -- cgit From 44f507163d9e51238458ee6904b4d71fb0723723 Mon Sep 17 00:00:00 2001 From: Vijay Mohan Pandarathil Date: Mon, 11 Mar 2013 09:28:44 -0600 Subject: VFIO: Wrapper for getting reference to vfio_device - Added vfio_device_get_from_dev() as wrapper to get reference to vfio_device from struct device. 
- Added vfio_device_data() as a wrapper to get device_data from vfio_device. Signed-off-by: Vijay Mohan Pandarathil Signed-off-by: Alex Williamson --- drivers/vfio/vfio.c | 30 +++++++++++++++++++++++++++++- include/linux/vfio.h | 3 +++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index fcc12f3e60a3..21eddd9e0f26 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -392,12 +392,13 @@ static void vfio_device_release(struct kref *kref) } /* Device reference always implies a group reference */ -static void vfio_device_put(struct vfio_device *device) +void vfio_device_put(struct vfio_device *device) { struct vfio_group *group = device->group; kref_put_mutex(&device->kref, vfio_device_release, &group->device_lock); vfio_group_put(group); } +EXPORT_SYMBOL_GPL(vfio_device_put); static void vfio_device_get(struct vfio_device *device) { @@ -627,6 +628,33 @@ int vfio_add_group_dev(struct device *dev, } EXPORT_SYMBOL_GPL(vfio_add_group_dev); +/** + * Get a reference to the vfio_device for a device that is known to + * be bound to a vfio driver. The driver implicitly holds a + * vfio_device reference between vfio_add_group_dev and + * vfio_del_group_dev. We can therefore use drvdata to increment + * that reference from the struct device. This additional + * reference must be released by calling vfio_device_put. + */ +struct vfio_device *vfio_device_get_from_dev(struct device *dev) +{ + struct vfio_device *device = dev_get_drvdata(dev); + + vfio_device_get(device); + + return device; +} +EXPORT_SYMBOL_GPL(vfio_device_get_from_dev); + +/* + * Caller must hold a reference to the vfio_device + */ +void *vfio_device_data(struct vfio_device *device) +{ + return device->device_data; +} +EXPORT_SYMBOL_GPL(vfio_device_data); + /* Given a referenced group, check if it contains the device */ static bool vfio_dev_present(struct vfio_group *group, struct device *dev) { diff --git a/include/linux/vfio.h b/include/linux/vfio.h index ab9e86224c54..ac8d488e4372 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -45,6 +45,9 @@ extern int vfio_add_group_dev(struct device *dev, void *device_data); extern void *vfio_del_group_dev(struct device *dev); +extern struct vfio_device *vfio_device_get_from_dev(struct device *dev); +extern void vfio_device_put(struct vfio_device *device); +extern void *vfio_device_data(struct vfio_device *device); /** * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks -- cgit From b818d1a7f72575eef17e00dc4085512c9cc8897d Mon Sep 17 00:00:00 2001 From: Hector Palacios Date: Sun, 10 Mar 2013 22:50:02 +0000 Subject: phy/micrel: Add support for KSZ8031 Micrel PHY KSZ8031 is similar to KSZ8021 and also requires the special initialization of "Operation Mode Strap Override" in reg 0x16 introduced in 212ea99 (phy/micrel: Implement support for KSZ8021). Signed-off-by: Hector Palacios Reviewed-by: Marek Vasut Signed-off-by: David S. 
Miller --- drivers/net/phy/micrel.c | 14 ++++++++++++++ include/linux/micrel_phy.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index abf7b6153d00..018af1852fe1 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -191,6 +191,19 @@ static struct phy_driver ksphy_driver[] = { .ack_interrupt = kszphy_ack_interrupt, .config_intr = kszphy_config_intr, .driver = { .owner = THIS_MODULE,}, +}, { + .phy_id = PHY_ID_KSZ8031, + .phy_id_mask = 0x00ffffff, + .name = "Micrel KSZ8031", + .features = (PHY_BASIC_FEATURES | SUPPORTED_Pause | + SUPPORTED_Asym_Pause), + .flags = PHY_HAS_MAGICANEG | PHY_HAS_INTERRUPT, + .config_init = ksz8021_config_init, + .config_aneg = genphy_config_aneg, + .read_status = genphy_read_status, + .ack_interrupt = kszphy_ack_interrupt, + .config_intr = kszphy_config_intr, + .driver = { .owner = THIS_MODULE,}, }, { .phy_id = PHY_ID_KSZ8041, .phy_id_mask = 0x00fffff0, @@ -325,6 +338,7 @@ static struct mdio_device_id __maybe_unused micrel_tbl[] = { { PHY_ID_KSZ8001, 0x00ffffff }, { PHY_ID_KS8737, 0x00fffff0 }, { PHY_ID_KSZ8021, 0x00ffffff }, + { PHY_ID_KSZ8031, 0x00ffffff }, { PHY_ID_KSZ8041, 0x00fffff0 }, { PHY_ID_KSZ8051, 0x00fffff0 }, { PHY_ID_KSZ8061, 0x00fffff0 }, diff --git a/include/linux/micrel_phy.h b/include/linux/micrel_phy.h index 9dbb41a4e250..8752dbbc6135 100644 --- a/include/linux/micrel_phy.h +++ b/include/linux/micrel_phy.h @@ -19,6 +19,7 @@ #define PHY_ID_KSZ9021 0x00221610 #define PHY_ID_KS8737 0x00221720 #define PHY_ID_KSZ8021 0x00221555 +#define PHY_ID_KSZ8031 0x00221556 #define PHY_ID_KSZ8041 0x00221510 #define PHY_ID_KSZ8051 0x00221550 /* same id: ks8001 Rev. A/B, and ks8721 Rev 3. */ -- cgit From 6ba8a3b19e764b6a65e4030ab0999be50c291e6c Mon Sep 17 00:00:00 2001 From: Nandita Dukkipati Date: Mon, 11 Mar 2013 10:00:43 +0000 Subject: tcp: Tail loss probe (TLP) This patch series implement the Tail loss probe (TLP) algorithm described in http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01. The first patch implements the basic algorithm. TLP's goal is to reduce tail latency of short transactions. It achieves this by converting retransmission timeouts (RTOs) occuring due to tail losses (losses at end of transactions) into fast recovery. TLP transmits one packet in two round-trips when a connection is in Open state and isn't receiving any ACKs. The transmitted packet, aka loss probe, can be either new or a retransmission. When there is tail loss, the ACK from a loss probe triggers FACK/early-retransmit based fast recovery, thus avoiding a costly RTO. In the absence of loss, there is no change in the connection state. PTO stands for probe timeout. It is a timer event indicating that an ACK is overdue and triggers a loss probe packet. The PTO value is set to max(2*SRTT, 10ms) and is adjusted to account for delayed ACK timer when there is only one oustanding packet. TLP Algorithm On transmission of new data in Open state: -> packets_out > 1: schedule PTO in max(2*SRTT, 10ms). -> packets_out == 1: schedule PTO in max(2*RTT, 1.5*RTT + 200ms) -> PTO = min(PTO, RTO) Conditions for scheduling PTO: -> Connection is in Open state. -> Connection is either cwnd limited or no new data to send. -> Number of probes per tail loss episode is limited to one. -> Connection is SACK enabled. When PTO fires: new_segment_exists: -> transmit new segment. -> packets_out++. cwnd remains same. no_new_packet: -> retransmit the last segment. 
Its ACK triggers FACK or early retransmit based recovery. ACK path: -> rearm RTO at start of ACK processing. -> reschedule PTO if need be. In addition, the patch includes a small variation to the Early Retransmit (ER) algorithm, such that ER and TLP together can in principle recover any N-degree of tail loss through fast recovery. TLP is controlled by the same sysctl as ER, tcp_early_retrans sysctl. tcp_early_retrans==0; disables TLP and ER. ==1; enables RFC5827 ER. ==2; delayed ER. ==3; TLP and delayed ER. [DEFAULT] ==4; TLP only. The TLP patch series have been extensively tested on Google Web servers. It is most effective for short Web trasactions, where it reduced RTOs by 15% and improved HTTP response time (average by 6%, 99th percentile by 10%). The transmitted probes account for <0.5% of the overall transmissions. Signed-off-by: Nandita Dukkipati Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 ++- include/linux/tcp.h | 1 - include/net/inet_connection_sock.h | 5 +- include/net/tcp.h | 6 +- include/uapi/linux/snmp.h | 1 + net/ipv4/inet_diag.c | 4 +- net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 4 +- net/ipv4/tcp_input.c | 24 ++++--- net/ipv4/tcp_ipv4.c | 4 +- net/ipv4/tcp_output.c | 128 +++++++++++++++++++++++++++++++-- net/ipv4/tcp_timer.c | 13 ++-- 12 files changed, 171 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index dc2dc87d2557..1cae6c383e1b 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -190,7 +190,9 @@ tcp_early_retrans - INTEGER Enable Early Retransmit (ER), per RFC 5827. ER lowers the threshold for triggering fast retransmit when the amount of outstanding data is small and when no previously unsent data can be transmitted (such - that limited transmit could be used). + that limited transmit could be used). Also controls the use of + Tail loss probe (TLP) that converts RTOs occuring due to tail + losses into fast recovery (draft-dukkipati-tcpm-tcp-loss-probe-01). Possible values: 0 disables ER 1 enables ER @@ -198,7 +200,9 @@ tcp_early_retrans - INTEGER by a fourth of RTT. This mitigates connection falsely recovers when network has a small degree of reordering (less than 3 packets). - Default: 2 + 3 enables delayed ER and TLP. + 4 enables TLP only. + Default: 3 tcp_ecn - INTEGER Control use of Explicit Congestion Notification (ECN) by TCP. 
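Editor's aside (not part of the patch): the five sysctl modes decompose
into two independent mechanisms. A standalone C sketch of the mapping,
with illustrative names, mirroring the gating in
tcp_enable_early_retrans(), tcp_pause_early_retransmit() and
tcp_schedule_loss_probe() in the hunks below:

	#include <stdbool.h>

	struct loss_recovery_modes {
		bool early_retrans;	/* RFC 5827 early retransmit */
		bool delayed_er;	/* ER delayed by a fourth of RTT */
		bool tlp;		/* tail loss probe */
	};

	static struct loss_recovery_modes decode(int tcp_early_retrans)
	{
		struct loss_recovery_modes m = { false, false, false };

		m.early_retrans = tcp_early_retrans >= 1 &&
				  tcp_early_retrans <= 3;	/* mode 4 is TLP only */
		m.delayed_er	= tcp_early_retrans == 2 ||
				  tcp_early_retrans == 3;
		m.tlp		= tcp_early_retrans >= 3;	/* modes 3 and 4 */
		return m;
	}
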
diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 515c3746b675..01860d74555c 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -201,7 +201,6 @@ struct tcp_sock { unused : 1; u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ - early_retrans_delayed:1, /* Delayed ER timer installed */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 183292722f6e..de2c78529afa 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -133,6 +133,8 @@ struct inet_connection_sock { #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ #define ICSK_TIME_DACK 2 /* Delayed ack timer */ #define ICSK_TIME_PROBE0 3 /* Zero window probe timer */ +#define ICSK_TIME_EARLY_RETRANS 4 /* Early retransmit timer */ +#define ICSK_TIME_LOSS_PROBE 5 /* Tail loss probe timer */ static inline struct inet_connection_sock *inet_csk(const struct sock *sk) { @@ -222,7 +224,8 @@ static inline void inet_csk_reset_xmit_timer(struct sock *sk, const int what, when = max_when; } - if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0) { + if (what == ICSK_TIME_RETRANS || what == ICSK_TIME_PROBE0 || + what == ICSK_TIME_EARLY_RETRANS || what == ICSK_TIME_LOSS_PROBE) { icsk->icsk_pending = what; icsk->icsk_timeout = jiffies + when; sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout); diff --git a/include/net/tcp.h b/include/net/tcp.h index a2baa5e4ba31..ab9f947b118b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -543,6 +543,8 @@ extern bool tcp_syn_flood_action(struct sock *sk, extern void tcp_push_one(struct sock *, unsigned int mss_now); extern void tcp_send_ack(struct sock *sk); extern void tcp_send_delayed_ack(struct sock *sk); +extern void tcp_send_loss_probe(struct sock *sk); +extern bool tcp_schedule_loss_probe(struct sock *sk); /* tcp_input.c */ extern void tcp_cwnd_application_limited(struct sock *sk); @@ -873,8 +875,8 @@ static inline void tcp_enable_fack(struct tcp_sock *tp) static inline void tcp_enable_early_retrans(struct tcp_sock *tp) { tp->do_early_retrans = sysctl_tcp_early_retrans && - !sysctl_tcp_thin_dupack && sysctl_tcp_reordering == 3; - tp->early_retrans_delayed = 0; + sysctl_tcp_early_retrans < 4 && !sysctl_tcp_thin_dupack && + sysctl_tcp_reordering == 3; } static inline void tcp_disable_early_retrans(struct tcp_sock *tp) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b49eab89c9fd..290bed6b085f 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -202,6 +202,7 @@ enum LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ + LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 7afa2c3c788f..8620408af574 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -158,7 +158,9 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, #define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS || + icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || 
+ icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; r->idiag_retrans = icsk->icsk_retransmits; r->idiag_expires = EXPIRES_IN_MS(icsk->icsk_timeout); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 32030a24e776..4c35911d935f 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -224,6 +224,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), + SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 960fd29d9b8e..cca4550f4082 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -28,7 +28,7 @@ static int zero; static int one = 1; -static int two = 2; +static int four = 4; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -760,7 +760,7 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &zero, - .extra2 = &two, + .extra2 = &four, }, { .procname = "udp_mem", diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0d9bdacce99f..b794f89ac1f2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -98,7 +98,7 @@ int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; int sysctl_tcp_moderate_rcvbuf __read_mostly = 1; -int sysctl_tcp_early_retrans __read_mostly = 2; +int sysctl_tcp_early_retrans __read_mostly = 3; #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ @@ -2150,15 +2150,16 @@ static bool tcp_pause_early_retransmit(struct sock *sk, int flag) * max(RTT/4, 2msec) unless ack has ECE mark, no RTT samples * available, or RTO is scheduled to fire first. */ - if (sysctl_tcp_early_retrans < 2 || (flag & FLAG_ECE) || !tp->srtt) + if (sysctl_tcp_early_retrans < 2 || sysctl_tcp_early_retrans > 3 || + (flag & FLAG_ECE) || !tp->srtt) return false; delay = max_t(unsigned long, (tp->srtt >> 5), msecs_to_jiffies(2)); if (!time_after(inet_csk(sk)->icsk_timeout, (jiffies + delay))) return false; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, delay, TCP_RTO_MAX); - tp->early_retrans_delayed = 1; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_EARLY_RETRANS, delay, + TCP_RTO_MAX); return true; } @@ -2321,7 +2322,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) * interval if appropriate. 
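 * (Editor's note, not part of the quoted source: the "==" to ">="
 * relaxation in this hunk is the ER variation mentioned in the
 * changelog; it lets early retransmit arm with several un-SACKed
 * segments outstanding, so that ER and TLP together can recover any
 * degree of tail loss.)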
*/ if (tp->do_early_retrans && !tp->retrans_out && tp->sacked_out && - (tp->packets_out == (tp->sacked_out + 1) && tp->packets_out < 4) && + (tp->packets_out >= (tp->sacked_out + 1) && tp->packets_out < 4) && !tcp_may_send_now(sk)) return !tcp_pause_early_retransmit(sk, flag); @@ -3081,6 +3082,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight) */ void tcp_rearm_rto(struct sock *sk) { + const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); /* If the retrans timer is currently being used by Fast Open @@ -3094,12 +3096,13 @@ void tcp_rearm_rto(struct sock *sk) } else { u32 rto = inet_csk(sk)->icsk_rto; /* Offset the time elapsed after installing regular RTO */ - if (tp->early_retrans_delayed) { + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { struct sk_buff *skb = tcp_write_queue_head(sk); const u32 rto_time_stamp = TCP_SKB_CB(skb)->when + rto; s32 delta = (s32)(rto_time_stamp - tcp_time_stamp); /* delta may not be positive if the socket is locked - * when the delayed ER timer fires and is rescheduled. + * when the retrans timer fires and is rescheduled. */ if (delta > 0) rto = delta; @@ -3107,7 +3110,6 @@ void tcp_rearm_rto(struct sock *sk) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); } - tp->early_retrans_delayed = 0; } /* This function is called when the delayed ER timer fires. TCP enters @@ -3601,7 +3603,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (tp->early_retrans_delayed) + if (icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); if (after(ack, prior_snd_una)) @@ -3678,6 +3681,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (dst) dst_confirm(dst); } + + if (icsk->icsk_pending == ICSK_TIME_RETRANS) + tcp_schedule_loss_probe(sk); return 1; no_queue: diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 8cdee120a50c..b7ab868c8284 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2703,7 +2703,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len) __u16 srcp = ntohs(inet->inet_sport); int rx_queue; - if (icsk->icsk_pending == ICSK_TIME_RETRANS) { + if (icsk->icsk_pending == ICSK_TIME_RETRANS || + icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { timer_active = 1; timer_expires = icsk->icsk_timeout; } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e2b4461074da..beb63dbc85f5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -74,6 +74,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, /* Account for new data that has been sent to the network. 
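 * (Editor's note, not part of the quoted source: the hunk below re-arms
 * the RTO whenever an early-retransmit or loss-probe timer was pending,
 * replacing the removed early_retrans_delayed flag.)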
*/ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) { + struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); unsigned int prior_packets = tp->packets_out; @@ -85,7 +86,8 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tp->frto_counter = 3; tp->packets_out += tcp_skb_pcount(skb); - if (!prior_packets || tp->early_retrans_delayed) + if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || + icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); } @@ -1959,6 +1961,9 @@ static int tcp_mtu_probe(struct sock *sk) * snd_up-64k-mss .. snd_up cannot be large. However, taking into * account rare use of URG, this is not a big flaw. * + * Send at most one packet when push_one > 0. Temporarily ignore + * cwnd limit to force at most one packet out when push_one == 2. + * Returns true, if no segments are in flight and we have queued segments, * but cannot send anything now because of SWS or another problem. */ @@ -1994,8 +1999,13 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, goto repair; /* Skip network transmission */ cwnd_quota = tcp_cwnd_test(tp, skb); - if (!cwnd_quota) - break; + if (!cwnd_quota) { + if (push_one == 2) + /* Force out a loss probe pkt. */ + cwnd_quota = 1; + else + break; + } if (unlikely(!tcp_snd_wnd_test(tp, skb, mss_now))) break; @@ -2049,10 +2059,120 @@ repair: if (likely(sent_pkts)) { if (tcp_in_cwnd_reduction(sk)) tp->prr_out += sent_pkts; + + /* Send one loss probe per tail loss episode. */ + if (push_one != 2) + tcp_schedule_loss_probe(sk); tcp_cwnd_validate(sk); return false; } - return !tp->packets_out && tcp_send_head(sk); + return (push_one == 2) || (!tp->packets_out && tcp_send_head(sk)); +} + +bool tcp_schedule_loss_probe(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + u32 timeout, tlp_time_stamp, rto_time_stamp; + u32 rtt = tp->srtt >> 3; + + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS)) + return false; + /* No consecutive loss probes. */ + if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { + tcp_rearm_rto(sk); + return false; + } + /* Don't do any loss probe on a Fast Open connection before 3WHS + * finishes. + */ + if (sk->sk_state == TCP_SYN_RECV) + return false; + + /* TLP is only scheduled when next timer event is RTO. */ + if (icsk->icsk_pending != ICSK_TIME_RETRANS) + return false; + + /* Schedule a loss probe in 2*RTT for SACK capable connections + * in Open state, that are either limited by cwnd or application. + */ + if (sysctl_tcp_early_retrans < 3 || !rtt || !tp->packets_out || + !tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open) + return false; + + if ((tp->snd_cwnd > tcp_packets_in_flight(tp)) && + tcp_send_head(sk)) + return false; + + /* Probe timeout is at least 1.5*rtt + TCP_DELACK_MAX to account + * for delayed ack when there's one outstanding packet. + */ + timeout = rtt << 1; + if (tp->packets_out == 1) + timeout = max_t(u32, timeout, + (rtt + (rtt >> 1) + TCP_DELACK_MAX)); + timeout = max_t(u32, timeout, msecs_to_jiffies(10)); + + /* If RTO is shorter, just schedule TLP in its place. 
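+ * (Editor's note, not in the original patch: this clamp implements the
+ * PTO = min(PTO, RTO) step from the algorithm description above.)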
*/ + tlp_time_stamp = tcp_time_stamp + timeout; + rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; + if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { + s32 delta = rto_time_stamp - tcp_time_stamp; + if (delta > 0) + timeout = delta; + } + + inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, + TCP_RTO_MAX); + return true; +} + +/* When probe timeout (PTO) fires, send a new segment if one exists, else + * retransmit the last segment. + */ +void tcp_send_loss_probe(struct sock *sk) +{ + struct sk_buff *skb; + int pcount; + int mss = tcp_current_mss(sk); + int err = -1; + + if (tcp_send_head(sk) != NULL) { + err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC); + goto rearm_timer; + } + + /* Retransmit last segment. */ + skb = tcp_write_queue_tail(sk); + if (WARN_ON(!skb)) + goto rearm_timer; + + pcount = tcp_skb_pcount(skb); + if (WARN_ON(!pcount)) + goto rearm_timer; + + if ((pcount > 1) && (skb->len > (pcount - 1) * mss)) { + if (unlikely(tcp_fragment(sk, skb, (pcount - 1) * mss, mss))) + goto rearm_timer; + skb = tcp_write_queue_tail(sk); + } + + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + + /* Probe with zero data doesn't trigger fast recovery. */ + if (skb->len > 0) + err = __tcp_retransmit_skb(sk, skb); + +rearm_timer: + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, + TCP_RTO_MAX); + + if (likely(!err)) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBES); + return; } /* Push out any pending frames which were held back due to diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b78aac30c498..ecd61d54147f 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -342,10 +342,6 @@ void tcp_retransmit_timer(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - if (tp->early_retrans_delayed) { - tcp_resume_early_retransmit(sk); - return; - } if (tp->fastopen_rsk) { WARN_ON_ONCE(sk->sk_state != TCP_SYN_RECV && sk->sk_state != TCP_FIN_WAIT1); @@ -495,13 +491,20 @@ void tcp_write_timer_handler(struct sock *sk) } event = icsk->icsk_pending; - icsk->icsk_pending = 0; switch (event) { + case ICSK_TIME_EARLY_RETRANS: + tcp_resume_early_retransmit(sk); + break; + case ICSK_TIME_LOSS_PROBE: + tcp_send_loss_probe(sk); + break; case ICSK_TIME_RETRANS: + icsk->icsk_pending = 0; tcp_retransmit_timer(sk); break; case ICSK_TIME_PROBE0: + icsk->icsk_pending = 0; tcp_probe_timer(sk); break; } -- cgit From 9b717a8d245075ffb8e95a2dfb4ee97ce4747457 Mon Sep 17 00:00:00 2001 From: Nandita Dukkipati Date: Mon, 11 Mar 2013 10:00:44 +0000 Subject: tcp: TLP loss detection. This is the second of the TLP patch series; it augments the basic TLP algorithm with a loss detection scheme. This patch implements a mechanism for loss detection when a Tail loss probe retransmission plugs a hole thereby masking packet loss from the sender. The loss detection algorithm relies on counting TLP dupacks as outlined in Sec. 3 of: http://tools.ietf.org/html/draft-dukkipati-tcpm-tcp-loss-probe-01 The basic idea is: Sender keeps track of TLP "episode" upon retransmission of a TLP packet. An episode ends when the sender receives an ACK above the SND.NXT (tracked by tlp_high_seq) at the time of the episode. We want to make sure that before the episode ends the sender receives a "TLP dupack", indicating that the TLP retransmission was unnecessary, so there was no loss/hole that needed plugging. 
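In the patch below this is the is_tlp_dupack test in
tcp_process_tlp_ack(); restated on its own (editor's framing, the
condition itself is verbatim from the hunk):

	bool is_tlp_dupack = (ack == tp->tlp_high_seq) &&
			     !(flag & (FLAG_SND_UNA_ADVANCED |
				       FLAG_NOT_DUP | FLAG_DATA_SACKED));

that is, an ACK exactly at tlp_high_seq that neither advances SND.UNA
nor carries new data or SACK information.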
If the sender gets no TLP dupack before the end of the episode, then it reduces ssthresh and the congestion window, because the TLP packet arriving at the receiver probably plugged a hole. Signed-off-by: Nandita Dukkipati Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_input.c | 39 +++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_minisocks.c | 1 + net/ipv4/tcp_output.c | 9 +++++++++ net/ipv4/tcp_timer.c | 2 ++ 7 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 01860d74555c..763c108ee03d 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -204,6 +204,7 @@ struct tcp_sock { syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_data_acked:1;/* data in SYN is acked by SYN-ACK */ + u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ u32 srtt; /* smoothed round trip time << 3 */ diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 290bed6b085f..e00013a1debc 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -203,6 +203,7 @@ enum LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ + LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */ LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4c35911d935f..b6f2ea174898 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -225,6 +225,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), + SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b794f89ac1f2..836d74dd0187 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2682,6 +2682,7 @@ static void tcp_init_cwnd_reduction(struct sock *sk, const bool set_ssthresh) struct tcp_sock *tp = tcp_sk(sk); tp->high_seq = tp->snd_nxt; + tp->tlp_high_seq = 0; tp->snd_cwnd_cnt = 0; tp->prior_cwnd = tp->snd_cwnd; tp->prr_delivered = 0; @@ -3569,6 +3570,38 @@ static void tcp_send_challenge_ack(struct sock *sk) } } +/* This routine deals with acks during a TLP episode. + * Ref: loss detection algorithm in draft-dukkipati-tcpm-tcp-loss-probe. + */ +static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + bool is_tlp_dupack = (ack == tp->tlp_high_seq) && + !(flag & (FLAG_SND_UNA_ADVANCED | + FLAG_NOT_DUP | FLAG_DATA_SACKED)); + + /* Mark the end of TLP episode on receiving TLP dupack or when + * ack is after tlp_high_seq. + */ + if (is_tlp_dupack) { + tp->tlp_high_seq = 0; + return; + } + + if (after(ack, tp->tlp_high_seq)) { + tp->tlp_high_seq = 0; + /* Don't reduce cwnd if DSACK arrives for TLP retrans. 
*/ + if (!(flag & FLAG_DSACKING_ACK)) { + tcp_init_cwnd_reduction(sk, true); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_Open); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPLOSSPROBERECOVERY); + } + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -3676,6 +3709,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_cong_avoid(sk, ack, prior_in_flight); } + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) { struct dst_entry *dst = __sk_dst_get(sk); if (dst) @@ -3697,6 +3733,9 @@ no_queue: */ if (tcp_send_head(sk)) tcp_ack_probe(sk); + + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); return 1; invalid_ack: diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index b83a49cc3816..4bdb09fca401 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -440,6 +440,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->fackets_out = 0; newtp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_enable_early_retrans(newtp); + newtp->tlp_high_seq = 0; /* So many TCP implementations out there (incorrectly) count the * initial SYN frame in their delayed-ACK and congestion control diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index beb63dbc85f5..8e7742f0b5d2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2132,6 +2132,7 @@ bool tcp_schedule_loss_probe(struct sock *sk) */ void tcp_send_loss_probe(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int pcount; int mss = tcp_current_mss(sk); @@ -2142,6 +2143,10 @@ void tcp_send_loss_probe(struct sock *sk) goto rearm_timer; } + /* At most one outstanding TLP retransmission. */ + if (tp->tlp_high_seq) + goto rearm_timer; + /* Retransmit last segment. */ skb = tcp_write_queue_tail(sk); if (WARN_ON(!skb)) @@ -2164,6 +2169,10 @@ void tcp_send_loss_probe(struct sock *sk) if (skb->len > 0) err = __tcp_retransmit_skb(sk, skb); + /* Record snd_nxt for loss detection. */ + if (likely(!err)) + tp->tlp_high_seq = tp->snd_nxt; + rearm_timer: inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index ecd61d54147f..eeccf795e917 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -356,6 +356,8 @@ void tcp_retransmit_timer(struct sock *sk) WARN_ON(tcp_write_queue_empty(sk)); + tp->tlp_high_seq = 0; + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { /* Receiver dastardly shrinks window. Our retransmits -- cgit From 42e836eb4527fb635cb799a701fe4c9fe741c03a Mon Sep 17 00:00:00 2001 From: Michael Stapelberg Date: Mon, 11 Mar 2013 13:56:44 +0000 Subject: phy: add set_wol/get_wol functions This allows ethernet drivers (such as the mv643xx_eth) to support Wake on LAN on platforms where PHY registers have to be configured for Wake on LAN (e.g. the Marvell Kirkwood based qnap TS-119P II). Signed-off-by: Michael Stapelberg Signed-off-by: David S. 
Miller --- drivers/net/phy/phy.c | 16 ++++++++++++++++ include/linux/phy.h | 10 ++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index ef9ea9248223..298b4c201733 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1188,3 +1188,19 @@ int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data) return 0; } EXPORT_SYMBOL(phy_ethtool_set_eee); + +int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) +{ + if (phydev->drv->set_wol) + return phydev->drv->set_wol(phydev, wol); + + return -EOPNOTSUPP; +} +EXPORT_SYMBOL(phy_ethtool_set_wol); + +void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol) +{ + if (phydev->drv->get_wol) + phydev->drv->get_wol(phydev, wol); +} +EXPORT_SYMBOL(phy_ethtool_get_wol); diff --git a/include/linux/phy.h b/include/linux/phy.h index 33999adbf8c8..9e11039dd7a3 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -455,6 +455,14 @@ struct phy_driver { */ void (*txtstamp)(struct phy_device *dev, struct sk_buff *skb, int type); + /* Some devices (e.g. qnap TS-119P II) require PHY register changes to + * enable Wake on LAN, so set_wol is provided to be called in the + * ethernet driver's set_wol function. */ + int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); + + /* See set_wol, but for checking whether Wake on LAN is enabled. */ + void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); + struct device_driver driver; }; #define to_phy_driver(d) container_of(d, struct phy_driver, driver) @@ -560,6 +568,8 @@ int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data); +int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); +void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); int __init mdio_bus_init(void); void mdio_bus_exit(void); -- cgit From d84ff0512f1bfc0d8c864efadb4523fce68919cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:29:59 -0700 Subject: workqueue: consistently use int for @cpu variables Workqueue is mixing unsigned int and int for @cpu variables. There's no point in using unsigned int for cpus - many cpu-related APIs take int anyway. Consistently use int for @cpu variables so that we can use negative values to mark special ones. This patch doesn't introduce any visible behavior changes.
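For illustration of the motivation, a minimal sketch (the sentinel and helper below are hypothetical, not part of this patch): with a signed @cpu, a negative value can serve as a "no specific CPU" marker that unsigned int cannot carry.

    /* Illustrative only: a negative sentinel becomes representable with int. */
    #define EXAMPLE_CPU_NONE	-1	/* hypothetical "not bound to a CPU" marker */

    static void example_bind(int cpu)
    {
    	if (cpu == EXAMPLE_CPU_NONE)
    		return;			/* unbound; nothing to do */
    	/* ... bind to @cpu ... */
    }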
Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 6 +++--- kernel/workqueue.c | 24 +++++++++++------------- kernel/workqueue_internal.h | 5 ++--- 3 files changed, 16 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5bd030f630a9..899be6636d20 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -435,7 +435,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); -extern bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq); +extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); /* @@ -466,12 +466,12 @@ static inline bool __deprecated flush_delayed_work_sync(struct delayed_work *dwo } #ifndef CONFIG_SMP -static inline long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg); +long work_on_cpu(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 26c67c76b6c5..73c5f68065b5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -124,7 +124,7 @@ enum { struct worker_pool { spinlock_t lock; /* the pool lock */ - unsigned int cpu; /* I: the associated cpu */ + int cpu; /* I: the associated cpu */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -467,8 +467,7 @@ static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) return &pools[highpri]; } -static struct pool_workqueue *get_pwq(unsigned int cpu, - struct workqueue_struct *wq) +static struct pool_workqueue *get_pwq(int cpu, struct workqueue_struct *wq) { if (!(wq->flags & WQ_UNBOUND)) { if (likely(cpu < nr_cpu_ids)) @@ -730,7 +729,7 @@ static void wake_up_worker(struct worker_pool *pool) * CONTEXT: * spin_lock_irq(rq->lock) */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); @@ -755,8 +754,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) * RETURNS: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -1159,7 +1157,7 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; @@ -1714,7 +1712,7 @@ static struct worker *create_worker(struct worker_pool *pool) if (pool->cpu != WORK_CPU_UNBOUND) worker->task = kthread_create_on_node(worker_thread, worker, cpu_to_node(pool->cpu), - "kworker/%u:%d%s", pool->cpu, id, pri); + "kworker/%d:%d%s", pool->cpu, id, pri); else worker->task = kthread_create(worker_thread, worker, "kworker/u:%d%s", id, pri); @@ -3345,7 +3343,7 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * RETURNS: * %true if congested, %false otherwise. 
*/ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq) { struct pool_workqueue *pwq = get_pwq(cpu, wq); @@ -3461,7 +3459,7 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct worker_pool *pool; switch (action & ~CPU_TASKS_FROZEN) { @@ -3507,7 +3505,7 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct work_struct unbind_work; switch (action & ~CPU_TASKS_FROZEN) { @@ -3547,7 +3545,7 @@ static void work_for_cpu_fn(struct work_struct *work) * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3705,7 +3703,7 @@ out_unlock: static int __init init_workqueues(void) { - unsigned int cpu; + int cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index f9c887731e2b..f116f071d919 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -59,8 +59,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ -- cgit From 7a4e344c5675eefbde93ed9a98ef45e0e4957bc2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:00 -0700 Subject: workqueue: introduce workqueue_attrs Introduce struct workqueue_attrs which carries worker attributes - currently the nice level and allowed cpumask along with helper routines alloc_workqueue_attrs() and free_workqueue_attrs(). Each worker_pool now carries ->attrs describing the attributes of its workers. All functions dealing with cpumask and nice level of workers are updated to follow worker_pool->attrs instead of determining them from other characteristics of the worker_pool, and init_workqueues() is updated to set worker_pool->attrs appropriately for all standard pools. Note that create_worker() is updated to always perform set_user_nice() and use set_cpus_allowed_ptr() combined with manual assertion of PF_THREAD_BOUND instead of kthread_bind(). This simplifies handling random attributes without affecting the outcome. This patch doesn't introduce any behavior changes. v2: Missing cpumask_var_t definition caused build failure on some archs. linux/cpumask.h included. 
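As a usage sketch of the helpers added here (illustrative only; example_build_attrs() is hypothetical, and the nice value and cpumask are arbitrary examples):

    static int example_build_attrs(void)
    {
    	struct workqueue_attrs *attrs;

    	attrs = alloc_workqueue_attrs(GFP_KERNEL);	/* cpumask defaults to all CPUs */
    	if (!attrs)
    		return -ENOMEM;

    	attrs->nice = -20;				/* example: high-priority workers */
    	cpumask_copy(attrs->cpumask, cpumask_of(0));	/* example: restrict to CPU 0 */

    	/* ... hand @attrs to a consumer, then drop our copy ... */
    	free_workqueue_attrs(attrs);
    	return 0;
    }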
Signed-off-by: Tejun Heo Reported-by: kbuild test robot Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 13 ++++++ kernel/workqueue.c | 103 ++++++++++++++++++++++++++++++++++++---------- 2 files changed, 94 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 899be6636d20..00c1b9ba8252 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -11,6 +11,7 @@ #include #include #include +#include struct workqueue_struct; @@ -115,6 +116,15 @@ struct delayed_work { int cpu; }; +/* + * A struct for workqueue attributes. This can be used to change + * attributes of an unbound workqueue. + */ +struct workqueue_attrs { + int nice; /* nice level */ + cpumask_var_t cpumask; /* allowed CPUs */ +}; + static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); @@ -399,6 +409,9 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, extern void destroy_workqueue(struct workqueue_struct *wq); +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); +void free_workqueue_attrs(struct workqueue_attrs *attrs); + extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 094f16668e1b..b0d3cbb83f63 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ struct ida worker_ida; /* L: for worker IDs */ + struct workqueue_attrs *attrs; /* I: worker attributes */ + /* * The current concurrency level. As it's likely to be accessed * from other CPUs during try_to_wake_up(), put it in a separate @@ -1566,14 +1568,13 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(current, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, pool->attrs->cpumask); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; if (task_cpu(current) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(pool->cpu))) + cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) return true; spin_unlock_irq(&pool->lock); @@ -1679,7 +1680,7 @@ static void rebind_workers(struct worker_pool *pool) * wq doesn't really matter but let's keep @worker->pool * and @pwq->pool consistent for sanity. */ - if (std_worker_pool_pri(worker->pool)) + if (worker->pool->attrs->nice < 0) wq = system_highpri_wq; else wq = system_wq; @@ -1721,7 +1722,7 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = std_worker_pool_pri(pool) ? "H" : ""; + const char *pri = pool->attrs->nice < 0 ? "H" : ""; struct worker *worker = NULL; int id = -1; @@ -1751,24 +1752,23 @@ static struct worker *create_worker(struct worker_pool *pool) if (IS_ERR(worker->task)) goto fail; - if (std_worker_pool_pri(pool)) - set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + set_user_nice(worker->task, pool->attrs->nice); + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); /* - * Determine CPU binding of the new worker depending on - * %POOL_DISASSOCIATED. The caller is responsible for ensuring the - * flag remains stable across this function. See the comments - * above the flag definition for details. 
- * - * As an unbound worker may later become a regular one if CPU comes - * online, make sure every worker has %PF_THREAD_BOUND set. + * %PF_THREAD_BOUND is used to prevent userland from meddling with + * cpumask of workqueue workers. This is an abuse. We need + * %PF_NO_SETAFFINITY. */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, pool->cpu); - } else { - worker->task->flags |= PF_THREAD_BOUND; + worker->task->flags |= PF_THREAD_BOUND; + + /* + * The caller is responsible for ensuring %POOL_DISASSOCIATED + * remains stable across this function. See the comments above the + * flag definition for details. + */ + if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - } return worker; fail: @@ -3123,7 +3123,52 @@ int keventd_up(void) return system_wq != NULL; } -static void init_worker_pool(struct worker_pool *pool) +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. Returns NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ + struct workqueue_attrs *attrs; + + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_setall(attrs->cpumask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initialize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure. + */ +static int init_worker_pool(struct worker_pool *pool) { spin_lock_init(&pool->lock); pool->flags |= POOL_DISASSOCIATED; @@ -3141,6 +3186,11 @@ static void init_worker_pool(struct worker_pool *pool) mutex_init(&pool->manager_arb); mutex_init(&pool->assoc_mutex); ida_init(&pool->worker_ida); + + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } static int alloc_and_link_pwqs(struct workqueue_struct *wq) @@ -3792,7 +3842,8 @@ out_unlock: static int __init init_workqueues(void) { - int cpu; + int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int i, cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < @@ -3809,10 +3860,18 @@ static int __init init_workqueues(void) for_each_wq_cpu(cpu) { struct worker_pool *pool; + i = 0; for_each_std_worker_pool(pool, cpu) { - init_worker_pool(pool); + BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; + if (cpu != WORK_CPU_UNBOUND) + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + else + cpumask_setall(pool->attrs->cpumask); + + pool->attrs->nice = std_nice[i++]; + /* alloc pool ID */ BUG_ON(worker_pool_assign_id(pool)); } -- cgit From 493008a8e475771a2126e0ce95a73e35b371d277 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:03 -0700 Subject: workqueue: drop WQ_RESCUER and test workqueue->rescuer for NULL instead WQ_RESCUER is superfluous. WQ_MEM_RECLAIM indicates that the user wants a rescuer and testing wq->rescuer for NULL can answer whether a given workqueue has a rescuer or not.
Drop WQ_RESCUER and test wq->rescuer directly. This will help simplifying __alloc_workqueue_key() failure path by allowing it to use destroy_workqueue() on a partially constructed workqueue, which in turn will help implementing dynamic management of pool_workqueues. While at it, clear wq->rescuer after freeing it in destroy_workqueue(). This is a precaution as scheduled changes will make destruction more complex. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 1 - kernel/workqueue.c | 22 ++++++++++------------ 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 00c1b9ba8252..c270b4eedf16 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,7 +295,6 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ - WQ_RESCUER = 1 << 7, /* internal: workqueue has rescuer */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a8b86f7b6e34..7ff2b9c5cc3a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1827,7 +1827,7 @@ static void send_mayday(struct work_struct *work) lockdep_assert_held(&workqueue_lock); - if (!(wq->flags & WQ_RESCUER)) + if (!wq->rescuer) return; /* mayday mayday mayday */ @@ -2285,7 +2285,7 @@ sleep: * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2769,7 +2769,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) lock_map_acquire(&pwq->wq->lockdep_map); else lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -3412,13 +3412,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, va_end(args); va_end(args1); - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; - max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3449,7 +3442,11 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } local_irq_enable(); - if (flags & WQ_RESCUER) { + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. 
+ */ + if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; wq->rescuer = rescuer = alloc_worker(); @@ -3533,9 +3530,10 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); - if (wq->flags & WQ_RESCUER) { + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); + wq->rescuer = NULL; } /* -- cgit From 9e8cd2f5898ab6710ad81f4583fada08bf8049a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: implement apply_workqueue_attrs() Implement apply_workqueue_attrs() which applies workqueue_attrs to the specified unbound workqueue by creating a new pwq (pool_workqueue) linked to worker_pool with the specified attributes. A new pwq is linked at the head of wq->pwqs instead of tail and __queue_work() verifies that the first unbound pwq has positive refcnt before choosing it for the actual queueing. This is to cover the case where creation of a new pwq races with queueing. As base ref on a pwq won't be dropped without making another pwq the first one, __queue_work() is guaranteed to make progress and not add work item to a dead pwq. init_and_link_pwq() is updated to return the last first pwq the new pwq replaced, which is put by apply_workqueue_attrs(). Note that apply_workqueue_attrs() is almost identical to unbound pwq part of alloc_and_link_pwqs(). The only difference is that there is no previous first pwq. apply_workqueue_attrs() is implemented to handle such cases and replaces unbound pwq handling in alloc_and_link_pwqs(). Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 2 ++ kernel/workqueue.c | 91 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 73 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index c270b4eedf16..e152394fa7eb 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -410,6 +410,8 @@ extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask); void free_workqueue_attrs(struct workqueue_attrs *attrs); +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 16fb6747276a..2a67fbbd192c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1228,7 +1228,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, if (unlikely(wq->flags & WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; - +retry: /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) { if (cpu == WORK_CPU_UNBOUND) @@ -1262,6 +1262,25 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, spin_lock(&pwq->pool->lock); } + /* + * pwq is determined and locked. For unbound pools, we could have + * raced with pwq release and it could already be dead. If its + * refcnt is zero, repeat pwq selection. Note that pwqs never die + * without another pwq replacing it as the first pwq or while a + * work item is executing on it, so the retying is guaranteed to + * make forward-progress. 
+ */ + if (unlikely(!pwq->refcnt)) { + if (wq->flags & WQ_UNBOUND) { + spin_unlock(&pwq->pool->lock); + cpu_relax(); + goto retry; + } + /* oops */ + WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", + wq->name, cpu); + } + /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); @@ -3425,7 +3444,8 @@ static void pwq_unbound_release_workfn(struct work_struct *work) static void init_and_link_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, - struct worker_pool *pool) + struct worker_pool *pool, + struct pool_workqueue **p_last_pwq) { BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); @@ -3445,13 +3465,58 @@ static void init_and_link_pwq(struct pool_workqueue *pwq, mutex_lock(&wq->flush_mutex); spin_lock_irq(&workqueue_lock); + if (p_last_pwq) + *p_last_pwq = first_pwq(wq); pwq->work_color = wq->work_color; - list_add_tail_rcu(&pwq->pwqs_node, &wq->pwqs); + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); spin_unlock_irq(&workqueue_lock); mutex_unlock(&wq->flush_mutex); } +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. If @attrs doesn't match the + * current attributes, a new pwq is created and made the first pwq which + * will serve all new work items. Older pwqs are released as in-flight + * work items finish. Note that a work item which repeatedly requeues + * itself back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct pool_workqueue *pwq, *last_pwq; + struct worker_pool *pool; + + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); + if (!pwq) + return -ENOMEM; + + pool = get_unbound_pool(attrs); + if (!pool) { + kmem_cache_free(pwq_cache, pwq); + return -ENOMEM; + } + + init_and_link_pwq(pwq, wq, pool, &last_pwq); + if (last_pwq) { + spin_lock_irq(&last_pwq->pool->lock); + put_pwq(last_pwq); + spin_unlock_irq(&last_pwq->pool->lock); + } + + return 0; +} + static int alloc_and_link_pwqs(struct workqueue_struct *wq) { bool highpri = wq->flags & WQ_HIGHPRI; @@ -3468,26 +3533,12 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); - init_and_link_pwq(pwq, wq, &cpu_pools[highpri]); + init_and_link_pwq(pwq, wq, &cpu_pools[highpri], NULL); } + return 0; } else { - struct pool_workqueue *pwq; - struct worker_pool *pool; - - pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); - if (!pwq) - return -ENOMEM; - - pool = get_unbound_pool(unbound_std_wq_attrs[highpri]); - if (!pool) { - kmem_cache_free(pwq_cache, pwq); - return -ENOMEM; - } - - init_and_link_pwq(pwq, wq, pool); + return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } - - return 0; } static int wq_clamp_max_active(int max_active, unsigned int flags, -- cgit From 618b01eb426dd2d73a4b5e5ebc6379e4eee3b123 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: make it clear that WQ_DRAINING is an internal flag We're gonna add another internal WQ flag. Let's make the distinction clear. Prefix WQ_DRAINING with __ and move it to bit 16. 
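The resulting convention, condensed for illustration (flag values as in the diff below): user-settable WQ_* flags keep the low bits, while internal flags take a double-underscore prefix and start at bit 16.

    enum {
    	WQ_UNBOUND	= 1 << 1,	/* user-settable: low bits */
    	WQ_MEM_RECLAIM	= 1 << 3,
    	__WQ_DRAINING	= 1 << 16,	/* internal: "__" prefix, high bits */
    };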
Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 2 +- kernel/workqueue.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index e152394fa7eb..1751ec4c47c9 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -294,7 +294,7 @@ enum { WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ - WQ_DRAINING = 1 << 6, /* internal: workqueue is draining */ + __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2a67fbbd192c..590f4d048ec7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1225,7 +1225,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && + if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; retry: @@ -2763,11 +2763,11 @@ void drain_workqueue(struct workqueue_struct *wq) /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ spin_lock_irq(&workqueue_lock); if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; + wq->flags |= __WQ_DRAINING; spin_unlock_irq(&workqueue_lock); reflush: flush_workqueue(wq); @@ -2795,7 +2795,7 @@ reflush: spin_lock(&workqueue_lock); if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; + wq->flags &= ~__WQ_DRAINING; spin_unlock(&workqueue_lock); local_irq_enable(); -- cgit From 8719dceae2f98a578507c0f6b49c93f320bd729c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: workqueue: reject adjusting max_active or applying attrs to ordered workqueues Adjusting max_active of or applying new workqueue_attrs to an ordered workqueue breaks its ordering guarantee. The former is obvious. The latter is because applying attrs creates a new pwq (pool_workqueue) and there is no ordering constraint between the old and new pwqs. Make apply_workqueue_attrs() and workqueue_set_max_active() trigger WARN_ON() if those operations are requested on an ordered workqueue and fail / ignore respectively. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 1751ec4c47c9..5668ab249af5 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -295,6 +295,7 @@ enum { WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ + __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ @@ -397,7 +398,7 @@ __alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) 
\ - alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) + alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 590f4d048ec7..cecd4ffe2c40 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3494,9 +3494,14 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, struct pool_workqueue *pwq, *last_pwq; struct worker_pool *pool; + /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) return -EINVAL; + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + pwq = kmem_cache_zalloc(pwq_cache, GFP_KERNEL); if (!pwq) return -ENOMEM; @@ -3752,6 +3757,10 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { struct pool_workqueue *pwq; + /* disallow meddling with max_active for ordered workqueues */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return; + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); spin_lock_irq(&workqueue_lock); -- cgit From ba630e4940924ad1962883c207a62890778ced63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:04 -0700 Subject: cpumask: implement cpumask_parse() We have cpulist_parse() but not cpumask_parse(). Implement it using bitmap_parse(). bitmap_parse() is weird in that it takes @len for a string in kernel memory, which is also inconsistent with bitmap_parselist(). Make cpumask_parse() calculate the length and don't expose the inconsistency to cpumask users. Maybe we can fix up bitmap_parse() later. This will be used to expose workqueue cpumask knobs to userland via sysfs. Signed-off-by: Tejun Heo Cc: Rusty Russell --- include/linux/cpumask.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 032560295fcb..d08e4d2a9b92 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -590,6 +590,21 @@ static inline int cpulist_scnprintf(char *buf, int len, nr_cpumask_bits); } +/** + * cpumask_parse - extract a cpumask from a string + * @buf: the buffer to extract from + * @dstp: the cpumask to set. + * + * Returns -errno, or 0 for success. + */ +static inline int cpumask_parse(const char *buf, struct cpumask *dstp) +{ + char *nl = strchr(buf, '\n'); + int len = nl ? nl - buf : strlen(buf); + + return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); +} + /** * cpulist_parse - extract a cpumask from a user string of ranges + @buf: the buffer to extract from -- cgit From d73ce004225a7b2ed75f4340bb63721d55552265 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: driver/base: implement subsys_virtual_register() Kay tells me the most appropriate place to expose workqueues to userland would be /sys/devices/virtual/workqueues/WQ_NAME which is symlinked to /sys/bus/workqueue/devices/WQ_NAME and that we're lacking a way to do that outside of driver core as virtual_device_parent() isn't exported and there's no interface to conveniently create a virtual subsystem. This patch implements subsys_virtual_register() by factoring out subsys_register() from subsys_system_register() and using it with virtual_device_parent() as the origin directory.
It's identical to subsys_system_register() other than the origin directory but we aren't gonna restrict the device names which should be used under it. This will be used to expose workqueue attributes to userland. Signed-off-by: Tejun Heo Acked-by: Greg Kroah-Hartman Cc: Kay Sievers --- drivers/base/base.h | 2 ++ drivers/base/bus.c | 73 +++++++++++++++++++++++++++++++++++--------------- drivers/base/core.c | 2 +- include/linux/device.h | 2 ++ 4 files changed, 57 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/base.h b/drivers/base/base.h index 6ee17bb391a9..b8bdfe61daa6 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -101,6 +101,8 @@ static inline int hypervisor_init(void) { return 0; } extern int platform_bus_init(void); extern void cpu_dev_init(void); +struct kobject *virtual_device_parent(struct device *dev); + extern int bus_add_device(struct device *dev); extern void bus_probe_device(struct device *dev); extern void bus_remove_device(struct device *dev); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 519865b53f76..2ae2d2f92b6b 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -1205,26 +1205,10 @@ static void system_root_device_release(struct device *dev) { kfree(dev); } -/** - * subsys_system_register - register a subsystem at /sys/devices/system/ - * @subsys: system subsystem - * @groups: default attributes for the root device - * - * All 'system' subsystems have a /sys/devices/system/ root device - * with the name of the subsystem. The root device can carry subsystem- - * wide attributes. All registered devices are below this single root - * device and are named after the subsystem with a simple enumeration - * number appended. The registered devices are not explicitely named; - * only 'id' in the device needs to be set. - * - * Do not use this interface for anything new, it exists for compatibility - * with bad ideas only. New subsystems should use plain subsystems; and - * add the subsystem-wide attributes should be added to the subsystem - * directory itself and not some create fake root-device placed in - * /sys/devices/system/. - */ -int subsys_system_register(struct bus_type *subsys, - const struct attribute_group **groups) + +static int subsys_register(struct bus_type *subsys, + const struct attribute_group **groups, + struct kobject *parent_of_root) { struct device *dev; int err; @@ -1243,7 +1227,7 @@ int subsys_system_register(struct bus_type *subsys, if (err < 0) goto err_name; - dev->kobj.parent = &system_kset->kobj; + dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; @@ -1263,8 +1247,55 @@ err_dev: bus_unregister(subsys); return err; } + +/** + * subsys_system_register - register a subsystem at /sys/devices/system/ + * @subsys: system subsystem + * @groups: default attributes for the root device + * + * All 'system' subsystems have a /sys/devices/system/ root device + * with the name of the subsystem. The root device can carry subsystem- + * wide attributes. All registered devices are below this single root + * device and are named after the subsystem with a simple enumeration + * number appended. The registered devices are not explicitely named; + * only 'id' in the device needs to be set. + * + * Do not use this interface for anything new, it exists for compatibility + * with bad ideas only. 
New subsystems should use plain subsystems; and + * add the subsystem-wide attributes should be added to the subsystem + * directory itself and not some create fake root-device placed in + * /sys/devices/system/. + */ +int subsys_system_register(struct bus_type *subsys, + const struct attribute_group **groups) +{ + return subsys_register(subsys, groups, &system_kset->kobj); +} EXPORT_SYMBOL_GPL(subsys_system_register); +/** + * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ + * @subsys: virtual subsystem + * @groups: default attributes for the root device + * + * All 'virtual' subsystems have a /sys/devices/system/ root device + * with the name of the subystem. The root device can carry subsystem-wide + * attributes. All registered devices are below this single root device. + * There's no restriction on device naming. This is for kernel software + * constructs which need sysfs interface. + */ +int subsys_virtual_register(struct bus_type *subsys, + const struct attribute_group **groups) +{ + struct kobject *virtual_dir; + + virtual_dir = virtual_device_parent(NULL); + if (!virtual_dir) + return -ENOMEM; + + return subsys_register(subsys, groups, virtual_dir); +} + int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); diff --git a/drivers/base/core.c b/drivers/base/core.c index 56536f4b0f6b..f58084a86e8c 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -690,7 +690,7 @@ void device_initialize(struct device *dev) set_dev_node(dev, -1); } -static struct kobject *virtual_device_parent(struct device *dev) +struct kobject *virtual_device_parent(struct device *dev) { static struct kobject *virtual_dir = NULL; diff --git a/include/linux/device.h b/include/linux/device.h index 9d6464ea99c6..ee10d4e7be1a 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -302,6 +302,8 @@ void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups); +int subsys_virtual_register(struct bus_type *subsys, + const struct attribute_group **groups); /** * struct class - device classes -- cgit From 226223ab3c4118ddd10688cc2c131135848371ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 11:30:05 -0700 Subject: workqueue: implement sysfs interface for workqueues There are cases where workqueue users want to expose control knobs to userland. e.g. Unbound workqueues with custom attributes are scheduled to be used for writeback workers and depending on configuration it can be useful to allow admins to tinker with the priority or allowed CPUs. This patch implements workqueue_sysfs_register(), which makes the workqueue visible under /sys/bus/workqueue/devices/WQ_NAME. There currently are two attributes common to both per-cpu and unbound pools and extra attributes for unbound pools including nice level and cpumask. If alloc_workqueue*() is called with WQ_SYSFS, workqueue_sysfs_register() is called automatically as part of workqueue creation. This is the preferred method unless the workqueue user wants to apply workqueue_attrs before making the workqueue visible to userland. v2: Disallow exposing ordered workqueues as ordered workqueues can't be tuned in any way. 
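A usage sketch of the manual path (illustrative only; example_setup() and the workqueue name are hypothetical): allocate without WQ_SYSFS, apply attributes, and only then register, so userland never sees half-configured knobs.

    static struct workqueue_struct *example_setup(const struct workqueue_attrs *attrs)
    {
    	struct workqueue_struct *wq;

    	wq = alloc_workqueue("example_wq", WQ_UNBOUND, 0);
    	if (!wq)
    		return NULL;

    	if (!apply_workqueue_attrs(wq, attrs) && !workqueue_sysfs_register(wq))
    		return wq;	/* now at /sys/bus/workqueue/devices/example_wq */

    	destroy_workqueue(wq);
    	return NULL;
    }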
Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 8 ++ kernel/workqueue.c | 288 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 5668ab249af5..7f6d29a417c0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -293,6 +293,7 @@ enum { WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu instensive workqueue */ + WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ @@ -495,4 +496,11 @@ extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ +#ifdef CONFIG_SYSFS +int workqueue_sysfs_register(struct workqueue_struct *wq); +#else /* CONFIG_SYSFS */ +static inline int workqueue_sysfs_register(struct workqueue_struct *wq) +{ return 0; } +#endif /* CONFIG_SYSFS */ + #endif diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cecd4ffe2c40..c82feac0a878 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -210,6 +210,8 @@ struct wq_flusher { struct completion done; /* flush completion */ }; +struct wq_device; + /* * The externally visible workqueue abstraction is an array of * per-CPU workqueues: @@ -233,6 +235,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved pwq max_active */ + +#ifdef CONFIG_SYSFS + struct wq_device *wq_dev; /* I: for sysfs interface */ +#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif @@ -442,6 +448,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], static DEFINE_IDR(worker_pool_idr); static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) @@ -3153,6 +3161,281 @@ int keventd_up(void) return system_wq != NULL; } +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. 
+ * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { + __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), + __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), + __ATTR_NULL, +}; + +static ssize_t wq_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct worker_pool *pool; + int written; + + rcu_read_lock_sched(); + pool = first_pwq(wq)->pool; + written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + first_pwq(wq)->pool->attrs->nice); + rcu_read_unlock_sched(); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + rcu_read_lock_sched(); + copy_workqueue_attrs(attrs, first_pwq(wq)->pool->attrs); + rcu_read_unlock_sched(); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= -20 && attrs->nice <= 19) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + rcu_read_lock_sched(); + written = cpumask_scnprintf(buf, PAGE_SIZE, + first_pwq(wq)->pool->attrs->cpumask); + rcu_read_unlock_sched(); + + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = 
wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_attrs = wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) +{ + return subsys_virtual_register(&wq_subsys, NULL); +} +core_initcall(wq_sysfs_init); + +static void wq_device_release(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. + */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; + + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + /** * free_workqueue_attrs - free a workqueue_attrs * @attrs: workqueue_attrs to free @@ -3625,6 +3908,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, wake_up_process(rescuer->task); } + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + /* * workqueue_lock protects global freeze state and workqueues * list. 
Grab it, set max_active accordingly and add the new @@ -3693,6 +3979,8 @@ void destroy_workqueue(struct workqueue_struct *wq) spin_unlock_irq(&workqueue_lock); + workqueue_sysfs_unregister(wq); + if (wq->rescuer) { kthread_stop(wq->rescuer->task); kfree(wq->rescuer); -- cgit From cc2a8b1a5595a435191fb197d92d1f3e193c9a6d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 12 Mar 2013 13:59:14 -0700 Subject: async: remove unused @node from struct async_domain The @node in struct async_domain is unused after we introduce async_global_pending; remove it. tj: Unnecessary whitespace adjustments dropped. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: Arjan van de Ven --- include/linux/async.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/async.h b/include/linux/async.h index a2e3f18b2ad6..98ea0fef30d5 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -18,7 +18,6 @@ typedef u64 async_cookie_t; typedef void (async_func_ptr) (void *data, async_cookie_t cookie); struct async_domain { - struct list_head node; struct list_head pending; unsigned registered:1; }; @@ -27,8 +26,7 @@ struct async_domain { * domain participates in global async_synchronize_full */ #define ASYNC_DOMAIN(_name) \ - struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ - .pending = LIST_HEAD_INIT(_name.pending), \ + struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 1 } /* @@ -36,8 +34,7 @@ struct async_domain { * complete, this domain does not participate in async_synchronize_full */ #define ASYNC_DOMAIN_EXCLUSIVE(_name) \ - struct async_domain _name = { .node = LIST_HEAD_INIT(_name.node), \ - .pending = LIST_HEAD_INIT(_name.pending), \ + struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 0 } extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); -- cgit From 362f2b098b188ede9c4350cc20e58040dbfa515e Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 12 Mar 2013 13:59:14 -0700 Subject: async: rename and redefine async_func_ptr A function type is typically defined as typedef ret_type (*func)(args..) but async_func_ptr is not. Redefine it. Also rename async_func_ptr to async_func_t, because the _func_t suffix is more generic.
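The typedef distinction the patch draws, shown side by side for illustration (the variable names are hypothetical):

    /* Old: a function type; declarations need an explicit '*'. */
    typedef void (async_func_ptr) (void *data, async_cookie_t cookie);
    async_func_ptr *old_cb;

    /* New: a pointer-to-function type, matching the common idiom. */
    typedef void (*async_func_t) (void *data, async_cookie_t cookie);
    async_func_t new_cb;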
Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo Cc: Arjan van de Ven --- arch/sh/drivers/pci/pcie-sh7786.c | 2 +- include/linux/async.h | 6 +++--- kernel/async.c | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index c2c85f6cd738..a162a7f86b2e 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -35,7 +35,7 @@ static unsigned int nr_ports; static struct sh7786_pcie_hwops { int (*core_init)(void); - async_func_ptr *port_init_hw; + async_func_t port_init_hw; } *sh7786_pcie_hwops; static struct resource sh7786_pci0_resources[] = { diff --git a/include/linux/async.h b/include/linux/async.h index 98ea0fef30d5..6b0226bdaadc 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -16,7 +16,7 @@ #include typedef u64 async_cookie_t; -typedef void (async_func_ptr) (void *data, async_cookie_t cookie); +typedef void (*async_func_t) (void *data, async_cookie_t cookie); struct async_domain { struct list_head pending; unsigned registered:1; @@ -37,8 +37,8 @@ struct async_domain { struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 0 } -extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); -extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +extern async_cookie_t async_schedule(async_func_t func, void *data); +extern async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain); void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); diff --git a/kernel/async.c b/kernel/async.c index ab99c92f6b68..61f023ce0228 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -73,7 +73,7 @@ struct async_entry { struct list_head global_list; struct work_struct work; async_cookie_t cookie; - async_func_ptr *func; + async_func_t func; void *data; struct async_domain *domain; }; @@ -145,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) +static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -165,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ - ptr(data, newcookie); + func(data, newcookie); return newcookie; } INIT_LIST_HEAD(&entry->domain_list); INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = ptr; + entry->func = func; entry->data = data; entry->domain = domain; @@ -198,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a /** * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. 
*/ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +async_cookie_t async_schedule(async_func_t func, void *data) { - return __async_schedule(ptr, data, &async_dfl_domain); + return __async_schedule(func, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * @domain: the domain * @@ -222,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule); * synchronization domain is specified via @domain. Note: This function * may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain) { - return __async_schedule(ptr, data, domain); + return __async_schedule(func, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); -- cgit From 49d0de082c31de34cc896c14eec5f1c2ade0415a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 14 Feb 2013 16:42:34 -0800 Subject: rcu: Fix hlist_bl_set_first_rcu() annotation Abhi noticed that we were getting a complaint from the RCU subsystem about access to an RCU-protected list under the write side bit lock. This commit adds an annotation so that the check is satisfied by holding either the RCU read lock or the write side bit lock. Reported-by: Abhijith Das Signed-off-by: Steven Whitehouse Tested-by: Abhijith Das Signed-off-by: Paul E. McKenney --- include/linux/list_bl.h | 5 +++++ include/linux/rculist_bl.h | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index 31f9d75adc5b..2eb88556c5c5 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -125,6 +125,11 @@ static inline void hlist_bl_unlock(struct hlist_bl_head *b) __bit_spin_unlock(0, (unsigned long *)b); } +static inline bool hlist_bl_is_locked(struct hlist_bl_head *b) +{ + return bit_spin_is_locked(0, (unsigned long *)b); +} + /** * hlist_bl_for_each_entry - iterate over list of given type * @tpos: the type * to use as a loop cursor. diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h index cf1244fbf3b6..4f216c59e7db 100644 --- a/include/linux/rculist_bl.h +++ b/include/linux/rculist_bl.h @@ -20,7 +20,7 @@ static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) { return (struct hlist_bl_node *) - ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK); + ((unsigned long)rcu_dereference_check(h->first, hlist_bl_is_locked(h)) & ~LIST_BL_LOCKMASK); } /** -- cgit From e7b2dcc52b0e2d598a469f01cc460ccdde6869f2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 12 Mar 2013 15:35:58 -0700 Subject: cgroup: remove cgroup_is_descendant() It was used only by the ns cgroup, which was removed long ago.
Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 28 ---------------------------- 2 files changed, 31 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f76829dd75e..7e818a3ef60a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -448,9 +448,6 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); int cgroup_task_count(const struct cgroup *cgrp); -/* Return true if cgrp is a descendant of the task's cgroup */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); - /* * Control Group taskset, used to pass around set of tasks to cgroup_subsys * methods. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7a6c4c72ca55..f51443fd5f71 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5035,34 +5035,6 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) put_css_set_taskexit(cg); } -/** - * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp - * @cgrp: the cgroup in question - * @task: the task in question - * - * See if @cgrp is a descendant of @task's cgroup in the appropriate - * hierarchy. - * - * If we are sending in dummytop, then presumably we are creating - * the top cgroup in the subsystem. - * - * Called only by the ns (nsproxy) cgroup. - */ -int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) -{ - int ret; - struct cgroup *target; - - if (cgrp == dummytop) - return 1; - - target = task_cgroup_from_root(task, cgrp->root); - while (cgrp != target && cgrp!= cgrp->top_cgroup) - cgrp = cgrp->parent; - ret = (cgrp == target); - return ret; -} - static void check_for_release(struct cgroup *cgrp) { /* All of these checks rely on RCU to keep the cgroup -- cgit From e62676169118bc2d42e5008b3f8872646313f077 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Mar 2013 17:41:37 -0700 Subject: workqueue: implement current_is_workqueue_rescuer() Implement a function which queries whether the current task is running off a workqueue rescuer. This will be used to convert writeback to workqueue. Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 1 + kernel/workqueue.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 7f6d29a417c0..df30763c8682 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -451,6 +451,7 @@ extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); +extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c82feac0a878..f5c8bbb9ada3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4071,6 +4071,19 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) } EXPORT_SYMBOL_GPL(workqueue_set_max_active); +/** + * current_is_workqueue_rescuer - is %current a workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer. Can be used from + * work functions to determine whether it's being run off the rescuer task.
+ */ +bool current_is_workqueue_rescuer(void) +{ + struct worker *worker = current_wq_worker(); + + return worker && worker == worker->current_pwq->wq->rescuer; +} + /** * workqueue_congested - test whether a workqueue is congested * @cpu: CPU in question -- cgit From e86ac13b031cf71d8f40ff513e627aac80e6b765 Mon Sep 17 00:00:00 2001 From: Mugunthan V N Date: Mon, 11 Mar 2013 23:16:35 +0000 Subject: drivers: net: ethernet: cpsw: change cpts_active_slave to active_slave Change cpts_active_slave to active_slave so that the same DT property can be used for ethtool and SIOCGMIIPHY. CC: Richard Cochran Signed-off-by: Mugunthan V N Signed-off-by: David S. Miller --- Documentation/devicetree/bindings/net/cpsw.txt | 7 ++++--- arch/arm/boot/dts/am33xx.dtsi | 2 +- drivers/net/ethernet/ti/cpsw.c | 10 +++++----- include/linux/platform_data/cpsw.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/net/cpsw.txt b/Documentation/devicetree/bindings/net/cpsw.txt index 8e49c4200928..4f2ca6b4a182 100644 --- a/Documentation/devicetree/bindings/net/cpsw.txt +++ b/Documentation/devicetree/bindings/net/cpsw.txt @@ -15,7 +15,8 @@ Required properties: - mac_control : Specifies Default MAC control register content for the specific platform - slaves : Specifies number for slaves -- cpts_active_slave : Specifies the slave to use for time stamping +- active_slave : Specifies the slave to use for time stamping, + ethtool and SIOCGMIIPHY - cpts_clock_mult : Numerator to convert input clock ticks into nanoseconds - cpts_clock_shift : Denominator to convert input clock ticks into nanoseconds @@ -52,7 +53,7 @@ Examples: rx_descs = <64>; mac_control = <0x20>; slaves = <2>; - cpts_active_slave = <0>; + active_slave = <0>; cpts_clock_mult = <0x80000000>; cpts_clock_shift = <29>; cpsw_emac0: slave@0 { @@ -78,7 +79,7 @@ Examples: rx_descs = <64>; mac_control = <0x20>; slaves = <2>; - cpts_active_slave = <0>; + active_slave = <0>; cpts_clock_mult = <0x80000000>; cpts_clock_shift = <29>; cpsw_emac0: slave@0 { diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 0957645b73af..91fe4f148f80 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -349,7 +349,7 @@ rx_descs = <64>; mac_control = <0x20>; slaves = <2>; - cpts_active_slave = <0>; + active_slave = <0>; cpts_clock_mult = <0x80000000>; cpts_clock_shift = <29>; reg = <0x4a100000 0x800 diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index 01ffbc486982..98aa17a9516a 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -942,7 +942,7 @@ static void cpsw_ndo_change_rx_flags(struct net_device *ndev, int flags) static void cpsw_hwtstamp_v1(struct cpsw_priv *priv) { - struct cpsw_slave *slave = &priv->slaves[priv->data.cpts_active_slave]; + struct cpsw_slave *slave = &priv->slaves[priv->data.active_slave]; u32 ts_en, seq_id; if (!priv->cpts->tx_enable && !priv->cpts->rx_enable) { @@ -971,7 +971,7 @@ static void cpsw_hwtstamp_v2(struct cpsw_priv *priv) if (priv->data.dual_emac) slave = &priv->slaves[priv->emac_port]; else - slave = &priv->slaves[priv->data.cpts_active_slave]; + slave = &priv->slaves[priv->data.active_slave]; ctrl = slave_read(slave, CPSW2_CONTROL); ctrl &= ~CTRL_ALL_TS_MASK; @@ -1282,12 +1282,12 @@ static int cpsw_probe_dt(struct cpsw_platform_data *data, } data->slaves = prop; - if (of_property_read_u32(node, "cpts_active_slave", &prop)) { - pr_err("Missing cpts_active_slave
property in the DT.\n"); + if (of_property_read_u32(node, "active_slave", &prop)) { + pr_err("Missing active_slave property in the DT.\n"); ret = -EINVAL; goto error_ret; } - data->cpts_active_slave = prop; + data->active_slave = prop; if (of_property_read_u32(node, "cpts_clock_mult", &prop)) { pr_err("Missing cpts_clock_mult property in the DT.\n"); diff --git a/include/linux/platform_data/cpsw.h b/include/linux/platform_data/cpsw.h index 798fb80b024b..bb3cd58d71e3 100644 --- a/include/linux/platform_data/cpsw.h +++ b/include/linux/platform_data/cpsw.h @@ -30,7 +30,7 @@ struct cpsw_platform_data { u32 channels; /* number of cpdma channels (symmetric) */ u32 slaves; /* number of slave cpgmac ports */ struct cpsw_slave_data *slave_data; - u32 cpts_active_slave; /* time stamping slave */ + u32 active_slave; /* time stamping, ethtool and SIOCGMIIPHY slave */ u32 cpts_clock_mult; /* convert input clock ticks to nanoseconds */ u32 cpts_clock_shift; /* convert input clock ticks to nanoseconds */ u32 ale_entries; /* ale table size */ -- cgit From eaa907c546f76222227dfc41784b22588af1e3d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 6 Mar 2013 11:18:36 +0000 Subject: tick: Provide a check for a forced broadcast pending On the CPU which gets woken along with the target CPU of the broadcast the following happens: deep_idle() <-- spurious wakeup broadcast_exit() set forced bit enable interrupts <-- Nothing happens disable interrupts broadcast_enter() <-- Here we observe the forced bit is set deep_idle() Now after that the target CPU of the broadcast runs the broadcast handler and finds the other CPU in both the broadcast and the forced mask, sends the IPI and stuff gets back to normal. So it's not actually harmful, just more evidence for the theory that hardware designers have access to very special drug supplies. Now there is no point in going back to deep idle just to wake up again right away via an IPI. Provide a check which allows the idle code to avoid the deep idle transition. Signed-off-by: Thomas Gleixner Cc: LAK Cc: John Stultz Cc: Arjan van de Ven Cc: Lorenzo Pieralisi Tested-by: Santosh Shilimkar Cc: Jason Liu Link: http://lkml.kernel.org/r/20130306111537.565418308@linutronix.de Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 6 ++++++ kernel/time/tick-broadcast.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 494d33ea78f8..646aac136eed 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -175,6 +175,12 @@ extern void tick_broadcast(const struct cpumask *mask); extern int tick_receive_broadcast(void); #endif +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern int tick_check_broadcast_expired(void); +#else +static inline int tick_check_broadcast_expired(void) { return 0; } +#endif + #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); #else diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2100aad6b5f2..d76d816afc5d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -403,6 +403,18 @@ struct cpumask *tick_get_broadcast_oneshot_mask(void) return tick_broadcast_oneshot_mask; } +/* + * Called before going idle with interrupts disabled. Checks whether a + * broadcast event from the other core is about to happen. We detected + * that in tick_broadcast_oneshot_control().
The callsite can use this + * to avoid a deep idle transition as we are about to get the + * broadcast IPI right away. + */ +int tick_check_broadcast_expired(void) +{ + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_force_mask); +} + /* * Set broadcast interrupt affinity */ -- cgit From be871b7e54711479d3b9d3617d49898770830db2 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 12 Mar 2013 17:21:19 +0100 Subject: device: separate all subsys mutexes ca22e56d (driver-core: implement 'sysdev' functionality for regular devices and buses) has introduced bus_register macro with a static key to distinguish different subsys mutex classes. This however doesn't work for different subsys which use a common registering function. One example is subsys_system_register (and mce_device and cpu_device). In the end this leads to the following lockdep splat: [ 207.271924] ====================================================== [ 207.271932] [ INFO: possible circular locking dependency detected ] [ 207.271942] 3.9.0-rc1-0.7-default+ #34 Not tainted [ 207.271948] ------------------------------------------------------- [ 207.271957] bash/10493 is trying to acquire lock: [ 207.271963] (subsys mutex){+.+.+.}, at: [] bus_remove_device+0x37/0x1c0 [ 207.271987] [ 207.271987] but task is already holding lock: [ 207.271995] (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2f/0x60 [ 207.272012] [ 207.272012] which lock already depends on the new lock. [ 207.272012] [ 207.272023] [ 207.272023] the existing dependency chain (in reverse order) is: [ 207.272033] [ 207.272033] -> #4 (cpu_hotplug.lock){+.+.+.}: [ 207.272044] [] lock_acquire+0xe9/0x120 [ 207.272056] [] mutex_lock_nested+0x37/0x360 [ 207.272069] [] get_online_cpus+0x29/0x40 [ 207.272082] [] drain_all_stock+0x30/0x150 [ 207.272094] [] mem_cgroup_reclaim+0xaa/0xe0 [ 207.272104] [] __mem_cgroup_try_charge+0x51e/0xcf0 [ 207.272114] [] mem_cgroup_charge_common+0x36/0x60 [ 207.272125] [] mem_cgroup_newpage_charge+0x2a/0x30 [ 207.272135] [] do_wp_page+0x231/0x830 [ 207.272147] [] handle_pte_fault+0x19e/0x8d0 [ 207.272157] [] handle_mm_fault+0x158/0x1e0 [ 207.272166] [] do_page_fault+0x2a3/0x4e0 [ 207.272178] [] page_fault+0x28/0x30 [ 207.272189] [ 207.272189] -> #3 (&mm->mmap_sem){++++++}: [ 207.272199] [] lock_acquire+0xe9/0x120 [ 207.272208] [] might_fault+0x6d/0x90 [ 207.272218] [] filldir64+0xb3/0x120 [ 207.272229] [] call_filldir+0x89/0x130 [ext3] [ 207.272248] [] ext3_readdir+0x6b7/0x7e0 [ext3] [ 207.272263] [] vfs_readdir+0xa9/0xc0 [ 207.272273] [] sys_getdents64+0x9b/0x110 [ 207.272284] [] system_call_fastpath+0x16/0x1b [ 207.272296] [ 207.272296] -> #2 (&type->i_mutex_dir_key#3){+.+.+.}: [ 207.272309] [] lock_acquire+0xe9/0x120 [ 207.272319] [] mutex_lock_nested+0x37/0x360 [ 207.272329] [] link_path_walk+0x6f4/0x9a0 [ 207.272339] [] path_openat+0xba/0x470 [ 207.272349] [] do_filp_open+0x48/0xa0 [ 207.272358] [] file_open_name+0xdc/0x110 [ 207.272369] [] filp_open+0x35/0x40 [ 207.272378] [] _request_firmware+0x52e/0xb20 [ 207.272389] [] request_firmware+0x16/0x20 [ 207.272399] [] request_microcode_fw+0x61/0xd0 [microcode] [ 207.272416] [] microcode_init_cpu+0x104/0x150 [microcode] [ 207.272431] [] mc_device_add+0x7c/0xb0 [microcode] [ 207.272444] [] subsys_interface_register+0xc9/0x100 [ 207.272457] [] 0xffffffffa04fc0f4 [ 207.272472] [] do_one_initcall+0x42/0x180 [ 207.272485] [] load_module+0x19df/0x1b70 [ 207.272499] [] sys_init_module+0xe6/0x130 [ 207.272511] [] system_call_fastpath+0x16/0x1b [ 207.272523] [ 207.272523] -> #1 
(umhelper_sem){++++.+}: [ 207.272537] [] lock_acquire+0xe9/0x120 [ 207.272548] [] down_read+0x34/0x50 [ 207.272559] [] usermodehelper_read_trylock+0x4f/0x100 [ 207.272575] [] _request_firmware+0x59d/0xb20 [ 207.272587] [] request_firmware+0x16/0x20 [ 207.272599] [] request_microcode_fw+0x61/0xd0 [microcode] [ 207.272613] [] microcode_init_cpu+0x104/0x150 [microcode] [ 207.272627] [] mc_device_add+0x7c/0xb0 [microcode] [ 207.272641] [] subsys_interface_register+0xc9/0x100 [ 207.272654] [] 0xffffffffa04fc0f4 [ 207.272666] [] do_one_initcall+0x42/0x180 [ 207.272678] [] load_module+0x19df/0x1b70 [ 207.272690] [] sys_init_module+0xe6/0x130 [ 207.272702] [] system_call_fastpath+0x16/0x1b [ 207.272715] [ 207.272715] -> #0 (subsys mutex){+.+.+.}: [ 207.272729] [] __lock_acquire+0x13b2/0x15f0 [ 207.272740] [] lock_acquire+0xe9/0x120 [ 207.272751] [] mutex_lock_nested+0x37/0x360 [ 207.272763] [] bus_remove_device+0x37/0x1c0 [ 207.272775] [] device_del+0x134/0x1f0 [ 207.272786] [] device_unregister+0x22/0x60 [ 207.272798] [] mce_cpu_callback+0x15e/0x1ad [ 207.272812] [] notifier_call_chain+0x72/0x130 [ 207.272824] [] __raw_notifier_call_chain+0xe/0x10 [ 207.272839] [] _cpu_down+0x1d6/0x350 [ 207.272851] [] cpu_down+0x40/0x60 [ 207.272862] [] store_online+0x75/0xe0 [ 207.272874] [] dev_attr_store+0x20/0x30 [ 207.272886] [] sysfs_write_file+0xd9/0x150 [ 207.272900] [] vfs_write+0xcb/0x130 [ 207.272911] [] sys_write+0x64/0xa0 [ 207.272923] [] system_call_fastpath+0x16/0x1b [ 207.272936] [ 207.272936] other info that might help us debug this: [ 207.272936] [ 207.272952] Chain exists of: [ 207.272952] subsys mutex --> &mm->mmap_sem --> cpu_hotplug.lock [ 207.272952] [ 207.272973] Possible unsafe locking scenario: [ 207.272973] [ 207.272984] CPU0 CPU1 [ 207.272992] ---- ---- [ 207.273000] lock(cpu_hotplug.lock); [ 207.273009] lock(&mm->mmap_sem); [ 207.273020] lock(cpu_hotplug.lock); [ 207.273031] lock(subsys mutex); [ 207.273040] [ 207.273040] *** DEADLOCK *** [ 207.273040] [ 207.273055] 5 locks held by bash/10493: [ 207.273062] #0: (&buffer->mutex){+.+.+.}, at: [] sysfs_write_file+0x49/0x150 [ 207.273080] #1: (s_active#150){.+.+.+}, at: [] sysfs_write_file+0xc2/0x150 [ 207.273099] #2: (x86_cpu_hotplug_driver_mutex){+.+.+.}, at: [] cpu_hotplug_driver_lock+0x17/0x20 [ 207.273121] #3: (cpu_add_remove_lock){+.+.+.}, at: [] cpu_down+0x2c/0x60 [ 207.273140] #4: (cpu_hotplug.lock){+.+.+.}, at: [] cpu_hotplug_begin+0x2f/0x60 [ 207.273158] [ 207.273158] stack backtrace: [ 207.273170] Pid: 10493, comm: bash Not tainted 3.9.0-rc1-0.7-default+ #34 [ 207.273180] Call Trace: [ 207.273192] [] print_circular_bug+0x223/0x310 [ 207.273204] [] __lock_acquire+0x13b2/0x15f0 [ 207.273216] [] ? sysfs_hash_and_remove+0x60/0xc0 [ 207.273227] [] lock_acquire+0xe9/0x120 [ 207.273239] [] ? bus_remove_device+0x37/0x1c0 [ 207.273251] [] mutex_lock_nested+0x37/0x360 [ 207.273263] [] ? bus_remove_device+0x37/0x1c0 [ 207.273274] [] ? sysfs_hash_and_remove+0x60/0xc0 [ 207.273286] [] bus_remove_device+0x37/0x1c0 [ 207.273298] [] device_del+0x134/0x1f0 [ 207.273309] [] device_unregister+0x22/0x60 [ 207.273321] [] mce_cpu_callback+0x15e/0x1ad [ 207.273332] [] notifier_call_chain+0x72/0x130 [ 207.273344] [] __raw_notifier_call_chain+0xe/0x10 [ 207.273356] [] _cpu_down+0x1d6/0x350 [ 207.273368] [] ? 
cpu_hotplug_driver_lock+0x17/0x20 [ 207.273380] [] cpu_down+0x40/0x60 [ 207.273391] [] store_online+0x75/0xe0 [ 207.273402] [] dev_attr_store+0x20/0x30 [ 207.273413] [] sysfs_write_file+0xd9/0x150 [ 207.273425] [] vfs_write+0xcb/0x130 [ 207.273436] [] sys_write+0x64/0xa0 [ 207.273447] [] system_call_fastpath+0x16/0x1b This reports a false positive deadlock because lockdep sees: 1) load_module -> subsys_interface_register -> mc_device_add (*) -> subsys->p->mutex -> link_path_walk -> lookup_slow -> i_mutex 2) sys_write -> _cpu_down -> cpu_hotplug_begin -> cpu_hotplug.lock -> mce_cpu_callback -> mce_device_remove(**) -> device_unregister -> bus_remove_device -> subsys mutex 3) vfs_readdir -> i_mutex -> filldir64 -> might_fault -> might_lock_read(mmap_sem) -> page_fault -> mmap_sem -> drain_all_stock -> cpu_hotplug.lock but 1) takes the cpu_subsys subsys mutex (*) while 2) takes the mce_device subsys mutex (**), so the deadlock is not possible AFAICS. The fix is quite simple. We can pull the key inside the bus_type structure because each bus_type is defined separately, so the pointer will be unique as well. bus_register doesn't need to be a macro anymore, so change it into a regular function. We could get rid of __bus_register as there is no other caller, but maybe somebody will want to use a different key, so keep it around for now. Reported-by: Li Zefan Signed-off-by: Michal Hocko Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/base/bus.c | 8 ++++---- include/linux/device.h | 12 +++--------- 2 files changed, 7 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 519865b53f76..8a00dec574d6 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -898,18 +898,18 @@ static ssize_t bus_uevent_store(struct bus_type *bus, static BUS_ATTR(uevent, S_IWUSR, NULL, bus_uevent_store); /** - * __bus_register - register a driver-core subsystem + * bus_register - register a driver-core subsystem * @bus: bus to register - * @key: lockdep class key * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem.
*/ -int __bus_register(struct bus_type *bus, struct lock_class_key *key) +int bus_register(struct bus_type *bus) { int retval; struct subsys_private *priv; + struct lock_class_key *key = &bus->lock_key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) @@ -981,7 +981,7 @@ out: bus->p = NULL; return retval; } -EXPORT_SYMBOL_GPL(__bus_register); +EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system diff --git a/include/linux/device.h b/include/linux/device.h index 9d6464ea99c6..4a7c4a84afee 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -111,17 +111,11 @@ struct bus_type { struct iommu_ops *iommu_ops; struct subsys_private *p; + struct lock_class_key lock_key; }; -/* This is a #define to keep the compiler from merging different - * instances of the __key variable */ -#define bus_register(subsys) \ -({ \ - static struct lock_class_key __key; \ - __bus_register(subsys, &__key); \ -}) -extern int __must_check __bus_register(struct bus_type *bus, - struct lock_class_key *key); +extern int __must_check bus_register(struct bus_type *bus); + extern void bus_unregister(struct bus_type *bus); extern int __must_check bus_rescan_devices(struct bus_type *bus); -- cgit From f792685006274a850e6cc0ea9ade275ccdfc90bc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 5 Mar 2013 18:05:46 +0100 Subject: math64: New div64_u64_rem helper Provide an extended version of div64_u64() that also returns the remainder of the division. We are going to need this to refine the cputime scaling code. Signed-off-by: Frederic Weisbecker Cc: Stanislaw Gruszka Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Andrew Morton --- include/linux/math64.h | 19 ++++++++++++++++++- lib/div64.c | 19 +++++++++++++------ 2 files changed, 31 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/math64.h b/include/linux/math64.h index b8ba85544721..931a619407bf 100644 --- a/include/linux/math64.h +++ b/include/linux/math64.h @@ -29,6 +29,15 @@ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) return dividend / divisor; } +/** + * div64_u64_rem - unsigned 64bit divide with 64bit divisor + */ +static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) +{ + *remainder = dividend % divisor; + return dividend / divisor; +} + /** * div64_u64 - unsigned 64bit divide with 64bit divisor */ @@ -61,8 +70,16 @@ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif +#ifndef div64_u64_rem +extern u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); +#endif + #ifndef div64_u64 -extern u64 div64_u64(u64 dividend, u64 divisor); +static inline u64 div64_u64(u64 dividend, u64 divisor) +{ + u64 remainder; + return div64_u64_rem(dividend, divisor, &remainder); +} #endif #ifndef div64_s64 diff --git a/lib/div64.c b/lib/div64.c index a163b6caef73..3af5728d95fd 100644 --- a/lib/div64.c +++ b/lib/div64.c @@ -79,9 +79,10 @@ EXPORT_SYMBOL(div_s64_rem); #endif /** - * div64_u64 - unsigned 64bit divide with 64bit divisor + * div64_u64_rem - unsigned 64bit divide with 64bit divisor and 64bit remainder * @dividend: 64bit dividend * @divisor: 64bit divisor + * @remainder: 64bit remainder * * This implementation is a modified version of the algorithm proposed * by the book 'Hacker's Delight'. 
The original source and full proof @@ -89,27 +90,33 @@ EXPORT_SYMBOL(div_s64_rem); * * 'http://www.hackersdelight.org/HDcode/newCode/divDouble.c.txt' */ -#ifndef div64_u64 -u64 div64_u64(u64 dividend, u64 divisor) +#ifndef div64_u64_rem +u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { u32 high = divisor >> 32; u64 quot; if (high == 0) { - quot = div_u64(dividend, divisor); + u32 rem32; + quot = div_u64_rem(dividend, divisor, &rem32); + *remainder = rem32; } else { int n = 1 + fls(high); quot = div_u64(dividend >> n, divisor >> n); if (quot != 0) quot--; - if ((dividend - quot * divisor) >= divisor) + + *remainder = dividend - quot * divisor; + if (*remainder >= divisor) { quot++; + *remainder -= divisor; + } } return quot; } -EXPORT_SYMBOL(div64_u64); +EXPORT_SYMBOL(div64_u64_rem); #endif /** -- cgit From 8425e3d5bdbe8e741d2c73cf3189ed59b4038b84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 13 Mar 2013 16:51:36 -0700 Subject: workqueue: inline trivial wrappers There's no reason to make these trivial wrappers full (exported) functions. Inline the following: queue_work() queue_delayed_work() mod_delayed_work() schedule_work_on() schedule_work() schedule_delayed_work_on() schedule_delayed_work() keventd_up() Signed-off-by: Tejun Heo --- include/linux/workqueue.h | 123 +++++++++++++++++++++++++++++++++++++++++----- kernel/workqueue.c | 111 ----------------------------------------- 2 files changed, 111 insertions(+), 123 deletions(-) (limited to 'include/linux') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index df30763c8682..835d12b76960 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -417,28 +417,16 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); -extern bool queue_work(struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); -extern bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); -extern bool mod_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern void flush_scheduled_work(void); -extern bool schedule_work_on(int cpu, struct work_struct *work); -extern bool schedule_work(struct work_struct *work); -extern bool schedule_delayed_work_on(int cpu, struct delayed_work *work, - unsigned long delay); -extern bool schedule_delayed_work(struct delayed_work *work, - unsigned long delay); extern int schedule_on_each_cpu(work_func_t func); -extern int keventd_up(void); int execute_in_process_context(work_func_t fn, struct execute_work *); @@ -455,6 +443,117 @@ extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns %false if @work was already on a queue, %true otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU.
+ */ +static inline bool queue_work(struct workqueue_struct *wq, + struct work_struct *work) +{ + return queue_work_on(WORK_CPU_UNBOUND, wq, work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Equivalent to queue_delayed_work_on() but tries to use the local CPU. + */ +static inline bool queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * mod_delayed_work - modify delay of or queue a delayed work + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * mod_delayed_work_on() on local CPU. + */ +static inline bool mod_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); +} + +/** + * schedule_work_on - put work task on a specific cpu + * @cpu: cpu to put the work task on + * @work: job to be done + * + * This puts a job on a specific cpu + */ +static inline bool schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, system_wq, work); +} + +/** + * schedule_work - put work task in global workqueue + * @work: job to be done + * + * Returns %false if @work was already on the kernel-global workqueue and + * %true otherwise. + * + * This puts a job in the kernel-global workqueue if it was not already + * queued and leaves it in the same position on the kernel-global + * workqueue otherwise. + */ +static inline bool schedule_work(struct work_struct *work) +{ + return queue_work(system_wq, work); +} + +/** + * schedule_delayed_work_on - queue work in global workqueue on CPU after delay + * @cpu: cpu to use + * @dwork: job to be done + * @delay: number of jiffies to wait + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue on the specified CPU. + */ +static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work_on(cpu, system_wq, dwork, delay); +} + +/** + * schedule_delayed_work - put work task in global workqueue after delay + * @dwork: job to be done + * @delay: number of jiffies to wait or 0 for immediate execution + * + * After waiting for a given time this puts a job in the kernel-global + * workqueue. + */ +static inline bool schedule_delayed_work(struct delayed_work *dwork, + unsigned long delay) +{ + return queue_delayed_work(system_wq, dwork, delay); +} + +/** + * keventd_up - is workqueue initialized yet? + */ +static inline bool keventd_up(void) +{ + return system_wq != NULL; +} + /* * Like above, but uses del_timer() instead of del_timer_sync(). This means, * if it returns 0 the timer function may be running and the queueing is in diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 147fc5a784f0..f37421fb4f35 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1340,22 +1340,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. 
- */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); - void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1430,21 +1414,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_delayed_work_on); -/** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - /** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on @@ -1483,21 +1452,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(mod_delayed_work_on); -/** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. - */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, - unsigned long delay) -{ - return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - /** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state @@ -3001,66 +2955,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) } EXPORT_SYMBOL(cancel_delayed_work_sync); -/** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -bool schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. 
- */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - /** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call @@ -3154,11 +3048,6 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) -{ - return system_wq != NULL; -} - #ifdef CONFIG_SYSFS /* * Workqueues with WQ_SYSFS flag set is visible to userland via -- cgit From ae63b31e4d0e2ec09c569306ea46f664508ef717 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 3 May 2012 23:09:03 -0400 Subject: tracing: Separate out trace events from global variables The trace events for ftrace are all defined via global variables. The arrays of events and event systems are linked to a global list. This prevents multiple users of the event system from independently choosing what to enable and what not to. Adding descriptors to represent the event/file relation, as well as the trace_array descriptor they are associated with, allows more than one set of events to be defined. Once the trace event files have a link between the trace event and the trace_array they are associated with, we can create multiple trace_arrays that can record separate events in separate buffers. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 51 ++- include/trace/ftrace.h | 3 +- kernel/trace/trace.c | 8 + kernel/trace/trace.h | 39 +- kernel/trace/trace_events.c | 776 +++++++++++++++++++++++++------------ kernel/trace/trace_events_filter.c | 5 +- 6 files changed, 622 insertions(+), 260 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 13a54d0bdfa8..c7191d482f98 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -182,18 +182,20 @@ extern int ftrace_event_reg(struct ftrace_event_call *event, enum trace_reg type, void *data); enum { - TRACE_EVENT_FL_ENABLED_BIT, TRACE_EVENT_FL_FILTERED_BIT, - TRACE_EVENT_FL_RECORDED_CMD_BIT, TRACE_EVENT_FL_CAP_ANY_BIT, TRACE_EVENT_FL_NO_SET_FILTER_BIT, TRACE_EVENT_FL_IGNORE_ENABLE_BIT, }; +/* + * Event flags: + * FILTERED - The event has a filter attached + * CAP_ANY - Any user can enable for perf + * NO_SET_FILTER - Set when filter has error and is to be ignored + */ enum { - TRACE_EVENT_FL_ENABLED = (1 << TRACE_EVENT_FL_ENABLED_BIT), TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), - TRACE_EVENT_FL_RECORDED_CMD = (1 << TRACE_EVENT_FL_RECORDED_CMD_BIT), TRACE_EVENT_FL_CAP_ANY = (1 << TRACE_EVENT_FL_CAP_ANY_BIT), TRACE_EVENT_FL_NO_SET_FILTER = (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT), TRACE_EVENT_FL_IGNORE_ENABLE = (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT), @@ -203,12 +205,44 @@ struct ftrace_event_call { struct list_head list; struct ftrace_event_class *class; char *name; - struct dentry *dir; struct trace_event event; const char *print_fmt; struct event_filter *filter; + struct list_head *files; void *mod; void *data; + int flags; /* static flags of different events */ + +#ifdef CONFIG_PERF_EVENTS + int perf_refcount; + struct hlist_head __percpu *perf_events; +#endif +}; + +struct trace_array; +struct ftrace_subsystem_dir; + +enum { + FTRACE_EVENT_FL_ENABLED_BIT, + FTRACE_EVENT_FL_RECORDED_CMD_BIT, +}; + +/* + * Ftrace event file flags: + * ENABLED - The event is enabled + * RECORDED_CMD - The comms should be recorded at sched_switch + */ +enum { + 
FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), + FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), +}; + +struct ftrace_event_file { + struct list_head list; + struct ftrace_event_call *event_call; + struct dentry *dir; + struct trace_array *tr; + struct ftrace_subsystem_dir *system; /* * 32 bit flags: @@ -223,17 +257,12 @@ struct ftrace_event_call { * * Note: Reads of flags do not hold the event_mutex since * they occur in critical sections. But the way flags - * is currently used, these changes do no affect the code + * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to * caching and such. */ unsigned int flags; - -#ifdef CONFIG_PERF_EVENTS - int perf_refcount; - struct hlist_head __percpu *perf_events; -#endif }; #define __TRACE_EVENT_FLAGS(name, value) \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 40dc5e8fe340..191d9661e277 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -518,7 +518,8 @@ static inline notrace int ftrace_get_offsets_##call( \ static notrace void \ ftrace_raw_event_##call(void *__data, proto) \ { \ - struct ftrace_event_call *event_call = __data; \ + struct ftrace_event_file *ftrace_file = __data; \ + struct ftrace_event_call *event_call = ftrace_file->event_call; \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4f1dade56981..932931897b8d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -189,6 +189,8 @@ unsigned long long ns2usecs(cycle_t nsec) */ static struct trace_array global_trace; +LIST_HEAD(ftrace_trace_arrays); + static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); int filter_current_check_discard(struct ring_buffer *buffer, @@ -5359,6 +5361,12 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + list_add(&global_trace.list, &ftrace_trace_arrays); + while (trace_boot_options) { char *option; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 2081971367ea..037f7eb03d69 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -158,13 +158,39 @@ struct trace_array_cpu { */ struct trace_array { struct ring_buffer *buffer; + struct list_head list; int cpu; int buffer_disabled; + unsigned int flags; cycle_t time_start; + struct dentry *dir; + struct dentry *event_dir; + struct list_head systems; + struct list_head events; struct task_struct *waiter; struct trace_array_cpu *data[NR_CPUS]; }; +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) +}; + +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. 
+ */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) @@ -851,12 +877,19 @@ struct event_filter { struct event_subsystem { struct list_head list; const char *name; - struct dentry *entry; struct event_filter *filter; - int nr_events; int ref_count; }; +struct ftrace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct dentry *entry; + int ref_count; + int nr_events; +}; + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -914,7 +947,7 @@ extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250c..439955239bae 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -36,6 +36,19 @@ EXPORT_SYMBOL_GPL(event_storage); LIST_HEAD(ftrace_events); LIST_HEAD(ftrace_common_fields); +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct ftrace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { @@ -149,15 +162,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->name, call->class->probe, - call); + file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->name, call->class->probe, - call); + file); return 0; #ifdef CONFIG_PERF_EVENTS @@ -183,54 +198,57 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + do_for_each_event_file(tr, file) { + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; } else { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; } - } + } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, - int enable) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) { + struct ftrace_event_call *call = file->event_call; int ret 
= 0; switch (enable) { case 0: - if (call->flags & TRACE_EVENT_FL_ENABLED) { - call->flags &= ~TRACE_EVENT_FL_ENABLED; - if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + file->flags &= ~FTRACE_EVENT_FL_ENABLED; + if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; } - call->class->reg(call, TRACE_REG_UNREGISTER, NULL); + call->class->reg(call, TRACE_REG_UNREGISTER, file); } break; case 1: - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; } - ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->flags |= TRACE_EVENT_FL_ENABLED; + file->flags |= FTRACE_EVENT_FL_ENABLED; } break; } @@ -238,13 +256,13 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, return ret; } -static void ftrace_clear_events(void) +static void ftrace_clear_events(struct trace_array *tr) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - ftrace_event_enable_disable(call, 0); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } @@ -257,6 +275,8 @@ static void __put_system(struct event_subsystem *system) if (--system->ref_count) return; + list_del(&system->list); + if (filter) { kfree(filter->filter_string); kfree(filter); @@ -271,24 +291,45 @@ static void __get_system(struct event_subsystem *system) system->ref_count++; } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) { mutex_lock(&event_mutex); - __put_system(system); + __put_system_dir(dir); mutex_unlock(&event_mutex); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
*/ -static int __ftrace_set_clr_event(const char *match, const char *sub, - const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { + struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -307,7 +348,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, if (event && strcmp(event, call->name) != 0) continue; - ftrace_event_enable_disable(call, set); + ftrace_event_enable_disable(file, set); ret = 0; } @@ -316,7 +357,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, return ret; } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; @@ -344,7 +385,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(match, sub, event, set); + return __ftrace_set_clr_event(tr, match, sub, event, set); } /** @@ -361,7 +402,9 @@ static int ftrace_set_clr_event(char *buf, int set) */ int trace_set_clr_event(const char *system, const char *event, int set) { - return __ftrace_set_clr_event(NULL, system, event, set); + struct trace_array *tr = top_trace_array(); + + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +416,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) @@ -395,7 +440,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(parser.buffer + !set, set); + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } @@ -411,17 +456,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. 
*/ if (call->class && call->class->reg) - return call; + return file; } return NULL; @@ -429,30 +477,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = t_next(m, call, &l); - if (!call) + file = t_next(m, file, &l); + if (!file) break; } - return call; + return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->flags & TRACE_EVENT_FL_ENABLED) - return call; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return file; } return NULL; @@ -460,23 +510,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = s_next(m, call, &l); - if (!call) + file = s_next(m, file, &l); + if (!file) break; } - return call; + return file; } static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); @@ -494,10 +546,10 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; char *buf; - if (call->flags & TRACE_EVENT_FL_ENABLED) + if (file->flags & FTRACE_EVENT_FL_ENABLED) buf = "1\n"; else buf = "0\n"; @@ -509,10 +561,13 @@ static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; unsigned long val; int ret; + if (!file) + return -EINVAL; + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -525,7 +580,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; @@ -543,14 +598,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (!call->name || 
!call->class || !call->class->reg) continue; @@ -562,7 +621,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -584,7 +643,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; @@ -607,7 +667,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; @@ -845,43 +905,75 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; + struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_array *tr; int ret; - if (!inode->i_private) - goto skip_search; - /* Make sure the system still exists */ mutex_lock(&event_mutex); - list_for_each_entry(system, &event_subsystems, list) { - if (system == inode->i_private) { - /* Don't open systems with no events */ - if (!system->nr_events) { - system = NULL; - break; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + list_for_each_entry(dir, &tr->systems, list) { + if (dir == inode->i_private) { + /* Don't open systems with no events */ + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; } - __get_system(system); - break; } } + exit_loop: mutex_unlock(&event_mutex); - if (system != inode->i_private) + if (!system) return -ENODEV; - skip_search: + /* Some versions of gcc think dir can be uninitialized here */ + WARN_ON(!dir); + ret = tracing_open_generic(inode, filp); - if (ret < 0 && system) - put_system(system); + if (ret < 0) + put_system(dir); + + return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct ftrace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return -ENOMEM; + + dir->tr = tr; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + kfree(dir); + + filp->private_data = dir; return ret; } static int subsystem_release(struct inode *inode, struct file *file) { - struct event_subsystem *system = inode->i_private; + struct ftrace_subsystem_dir *dir = file->private_data; - if (system) - put_system(system); + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. 
+ */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); return 0; } @@ -890,7 +982,8 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -915,7 +1008,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -932,7 +1025,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_subsystem_event_filter(system, buf); + err = apply_subsystem_event_filter(dir, buf); free_page((unsigned long) buf); if (err < 0) return err; @@ -1041,30 +1134,35 @@ static const struct file_operations ftrace_system_enable_fops = { .release = subsystem_release, }; +static const struct file_operations ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, .llseek = default_llseek, }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) { - static struct dentry *d_tracer; - static struct dentry *d_events; - - if (d_events) - return d_events; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return NULL; + struct seq_file *m; + int ret; - d_events = debugfs_create_dir("events", d_tracer); - if (!d_events) - pr_warning("Could not create debugfs " - "'events' directory\n"); + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; - return d_events; + return ret; } static int @@ -1072,117 +1170,169 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_clear_events(); + ftrace_clear_events(tr); - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) + return NULL; + + system->ref_count = 1; + system->name = kstrdup(name, GFP_KERNEL); + + if (!system->name) + goto out_free; + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); + + return system; + + out_free: + kfree(system->name); + kfree(system); + return NULL; } static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, + struct ftrace_event_file *file, struct 
dentry *parent) { + struct ftrace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; if (strcmp(system->name, name) == 0) { - system->nr_events++; - return system->entry; + dir->nr_events++; + file->system = dir; + return dir->entry; } } - /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); - if (!system) { - pr_warning("No memory to create event subsystem %s\n", - name); - return d_events; + /* Now see if the system itself exists. */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + break; } + /* Reset system variable when not found */ + if (&system->list == &event_subsystems) + system = NULL; - system->entry = debugfs_create_dir(name, d_events); - if (!system->entry) { - pr_warning("Could not create event subsystem %s\n", - name); - kfree(system); - return d_events; - } + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; - system->nr_events = 1; - system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) { - debugfs_remove(system->entry); - kfree(system); - return d_events; + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + dir->entry = debugfs_create_dir(name, parent); + if (!dir->entry) { + pr_warning("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; } - list_add(&system->list, &event_subsystems); - - system->filter = NULL; - - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); - if (!system->filter) { - pr_warning("Could not allocate filter for subsystem " - "'%s'\n", name); - return system->entry; - } + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; - entry = debugfs_create_file("filter", 0644, system->entry, system, + entry = debugfs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); + pr_warning("Could not create debugfs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, system, + trace_create_file("enable", 0644, dir->entry, dir, &ftrace_system_enable_fops); - return system->entry; + list_add(&dir->list, &tr->systems); + + return dir->entry; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warning("No memory to create event subsystem %s\n", + name); + return NULL; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, + struct ftrace_event_file *file, const struct file_operations *id, const struct file_operations *enable, const struct file_operations *filter, const struct file_operations *format) { + struct ftrace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; struct list_head *head; + struct dentry *d_events; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". 
*/ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->class->system, d_events); - - call->dir = debugfs_create_dir(call->name, d_events); - if (!call->dir) { - pr_warning("Could not create debugfs " - "'%s' directory\n", call->name); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { + d_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!d_events) + return -ENOMEM; + } else + d_events = parent; + + file->dir = debugfs_create_dir(call->name, d_events); + if (!file->dir) { + pr_warning("Could not create debugfs '%s' directory\n", + call->name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, call->dir, call, + trace_create_file("enable", 0644, file->dir, file, enable); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, call->dir, call, + trace_create_file("id", 0444, file->dir, call, id); #endif @@ -1196,23 +1346,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); - return ret; + return -1; } } - trace_create_file("filter", 0644, call->dir, call, + trace_create_file("filter", 0644, file->dir, call, filter); - trace_create_file("format", 0444, call->dir, call, + trace_create_file("format", 0444, file->dir, call, format); return 0; } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ + struct ftrace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + + if (file->event_call != call) + continue; + + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kfree(file); + + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); +} + static void event_remove(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); + struct trace_array *tr; + struct ftrace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + ftrace_event_enable_disable(file, 0); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. 
+ */ + break; + } while_for_each_event_file(); + if (call->event.funcs) __unregister_ftrace_event(&call->event); + remove_event_from_tracers(call); list_del(&call->list); } @@ -1234,61 +1437,58 @@ static int event_init(struct ftrace_event_call *call) } static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod) { - struct dentry *d_events; int ret; ret = event_init(call); if (ret < 0) return ret; - d_events = event_trace_events_dir(); - if (!d_events) - return -ENOENT; - - ret = event_create_dir(call, d_events, id, enable, filter, format); - if (!ret) - list_add(&call->list, &ftrace_events); + list_add(&call->list, &ftrace_events); call->mod = mod; - return ret; + return 0; } +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct ftrace_event_file *file; + + file = kzalloc(sizeof(*file), GFP_KERNEL); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return event_create_dir(tr->event_dir, file, id, enable, filter, format); +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops); + /* Add an additional event_call dynamically */ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - mutex_unlock(&event_mutex); - return ret; -} -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call, NULL); - if (strcmp(name, TRACE_SYSTEM) == 0) - return; - - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - debugfs_remove_recursive(system->entry); - list_del(&system->list); - __put_system(system); - } - break; - } - } + mutex_unlock(&event_mutex); + return ret; } /* @@ -1299,8 +1499,6 @@ static void __trace_remove_event_call(struct ftrace_event_call *call) event_remove(call); trace_destroy_fields(call); destroy_preds(call); - debugfs_remove_recursive(call->dir); - remove_subsystem_dir(call->class->system); } /* Remove an event_call */ @@ -1335,6 +1533,17 @@ struct ftrace_module_file_ops { struct file_operations filter; }; +static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + return file_ops; + } + return NULL; +} + static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { @@ -1386,9 +1595,8 @@ static void trace_module_add_events(struct module *mod) return; for_each_event(call, start, end) { - __trace_add_event_call(*call, mod, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __register_event(*call, mod); + __add_event_to_tracers(*call, file_ops); } } @@ -1444,6 
+1652,10 @@ static int trace_module_notify(struct notifier_block *self, return 0; } #else +static struct ftrace_module_file_ops *find_ftrace_file_ops(struct module *mod) +{ + return NULL; +} static int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -1451,6 +1663,72 @@ static int trace_module_notify(struct notifier_block *self, } #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + if (call->mod) { + /* + * Directories for events by modules need to + * keep module ref counts when opened (as we don't + * want the module to disappear when reading one + * of these files). The file_ops keep account of + * the module ref count. + * + * As event_calls are added in groups by module, + * when we find one file_ops, we don't need to search for + * each call in that module, as the rest should be the + * same. Only search for a new one if the last one did + * not match. + */ + if (!file_ops || call->mod != file_ops->mod) + file_ops = find_ftrace_file_ops(call->mod); + if (!file_ops) + continue; /* Warn? */ + ret = __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + continue; + } + ret = __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + } +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (file_ops) + __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + else + __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + } +} + static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, @@ -1471,8 +1749,43 @@ static __init int setup_trace_event(char *str) } __setup("trace_event=", setup_trace_event); +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + struct dentry *d_events; + struct dentry *entry; + + entry = debugfs_create_file("set_event", 0644, parent, + tr, &ftrace_set_event_fops); + if (!entry) { + pr_warning("Could not create debugfs 'set_event' entry\n"); + return -ENOMEM; + } + + d_events = debugfs_create_dir("events", parent); + if (!d_events) + pr_warning("Could not create debugfs 'events' directory\n"); + + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + + tr->event_dir = d_events; + __trace_add_event_dirs(tr); + + return 0; +} + static __init int event_trace_enable(void) { + struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; char *buf = bootup_event_buf; char *token; @@ -1494,7 +1807,7 @@ static 
__init int event_trace_enable(void) if (!*token) continue; - ret = ftrace_set_clr_event(token, 1); + ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } @@ -1506,61 +1819,29 @@ static __init int event_trace_enable(void) static __init int event_trace_init(void) { - struct ftrace_event_call *call; + struct trace_array *tr; struct dentry *d_tracer; struct dentry *entry; - struct dentry *d_events; int ret; + tr = top_trace_array(); + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, - NULL, &ftrace_avail_fops); + tr, &ftrace_avail_fops); if (!entry) pr_warning("Could not create debugfs " "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, - NULL, &ftrace_set_event_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_event' entry\n"); - - d_events = event_trace_events_dir(); - if (!d_events) - return 0; - - /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - trace_create_file("enable", 0644, d_events, - NULL, &ftrace_system_enable_fops); - if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - /* - * Early initialization already enabled ftrace event. - * Now it's only necessary to create the event directory. - */ - list_for_each_entry(call, &ftrace_events, list) { - - ret = event_create_dir(call, d_events, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - if (ret < 0) - event_remove(call); - } + ret = event_trace_add_tracer(d_tracer, tr); + if (ret) + return ret; ret = register_module_notifier(&trace_module_nb); if (ret) @@ -1627,13 +1908,20 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { + struct ftrace_subsystem_dir *dir; + struct ftrace_event_file *file; struct ftrace_event_call *call; struct event_subsystem *system; + struct trace_array *tr; int ret; + tr = top_trace_array(); + pr_info("Running tests on trace events:\n"); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) @@ -1657,15 +1945,15 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. 
 */
-		if (call->flags & TRACE_EVENT_FL_ENABLED) {
+		if (file->flags & FTRACE_EVENT_FL_ENABLED) {
 			pr_warning("Enabled event during self test!\n");
 			WARN_ON_ONCE(1);
 			continue;
 		}

-		ftrace_event_enable_disable(call, 1);
+		ftrace_event_enable_disable(file, 1);
 		event_test_stuff();
-		ftrace_event_enable_disable(call, 0);
+		ftrace_event_enable_disable(file, 0);

 		pr_cont("OK\n");
 	}
@@ -1674,7 +1962,9 @@ static __init void event_trace_self_tests(void)

 	pr_info("Running tests on trace event systems:\n");

-	list_for_each_entry(system, &event_subsystems, list) {
+	list_for_each_entry(dir, &tr->systems, list) {
+
+		system = dir->subsystem;

 		/* the ftrace system is special, skip it */
 		if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +1972,7 @@ static __init void event_trace_self_tests(void)

 		pr_info("Testing event system %s: ", system->name);

-		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error enabling system %s\n", system->name);
@@ -1691,7 +1981,7 @@ static __init void event_trace_self_tests(void)

 		event_test_stuff();

-		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+		ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error disabling system %s\n", system->name);
@@ -1706,7 +1996,7 @@ static __init void event_trace_self_tests(void)
 	pr_info("Running tests on all trace events:\n");
 	pr_info("Testing all events: ");

-	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error enabling all events\n");
 		return;
@@ -1715,7 +2005,7 @@ static __init void event_trace_self_tests(void)

 	event_test_stuff();

 	/* reset sysname */
-	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+	ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
 		return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..2a22a177ab44 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1907,16 +1907,17 @@ out_unlock:
 	return err;
 }

-int apply_subsystem_event_filter(struct event_subsystem *system,
+int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
 				 char *filter_string)
 {
+	struct event_subsystem *system = dir->subsystem;
 	struct event_filter *filter;
 	int err = 0;

 	mutex_lock(&event_mutex);

 	/* Make sure the system still has events */
-	if (!system->nr_events) {
+	if (!dir->nr_events) {
 		err = -ENODEV;
 		goto out_unlock;
 	}
-- cgit

From ccb469a198cffac94a7eea0b69f715f06e2ddf15 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Thu, 2 Aug 2012 10:32:10 -0400
Subject: tracing: Pass the ftrace_file to the buffer lock reserve code

Pass the struct ftrace_event_file *ftrace_file to
trace_event_buffer_lock_reserve() (the new function that replaces
trace_current_buffer_lock_reserve()).

The ftrace_file holds a pointer to the trace_array that is in use.
In the case of multiple buffers with different trace_arrays, this
allows different events to be recorded into different buffers.
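To make the routing concrete, here is a minimal caller-side sketch. The
file_a and file_b pointers are hypothetical ftrace_event_file instances
belonging to two different trace_arrays; only the reserve call itself is
taken from this patch:

	struct ring_buffer *buffer;
	struct ring_buffer_event *event;

	/* file_a->tr and file_b->tr point at two different trace_arrays */
	event = trace_event_buffer_lock_reserve(&buffer, file_a, type,
						len, flags, pc);
	/* buffer now refers to the ring buffer of file_a->tr */

	event = trace_event_buffer_lock_reserve(&buffer, file_b, type,
						len, flags, pc);
	/* same event type, but reserved in the buffer of file_b->tr */

The function itself only has to dereference ftrace_file->tr to pick the
buffer, as the trace.c hunk below shows.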
Also fixed some of the stale comments in include/trace/ftrace.h Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 7 +++++++ include/trace/ftrace.h | 9 +++++---- kernel/trace/trace.c | 12 ++++++++++++ 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c7191d482f98..fd28c170c597 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -128,6 +128,13 @@ enum print_line_t { void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, int pc); +struct ftrace_event_file; + +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_buffer, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_buffer, int type, unsigned long len, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 191d9661e277..e5d140a91fd7 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -414,7 +414,8 @@ static inline notrace int ftrace_get_offsets_##call( \ * * static void ftrace_raw_event_(void *__data, proto) * { - * struct ftrace_event_call *event_call = __data; + * struct ftrace_event_file *ftrace_file = __data; + * struct ftrace_event_call *event_call = ftrace_file->event_call; * struct ftrace_data_offsets_ __maybe_unused __data_offsets; * struct ring_buffer_event *event; * struct ftrace_raw_ *entry; <-- defined in stage 1 @@ -428,7 +429,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * * __data_size = ftrace_get_offsets_(&__data_offsets, args); * - * event = trace_current_buffer_lock_reserve(&buffer, + * event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, * event_->event.type, * sizeof(*entry) + __data_size, * irq_flags, pc); @@ -440,7 +441,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * __array macros. 
* * if (!filter_current_check_discard(buffer, event_call, entry, event)) - * trace_current_buffer_unlock_commit(buffer, + * trace_nowake_buffer_unlock_commit(buffer, * event, irq_flags, pc); * } * @@ -533,7 +534,7 @@ ftrace_raw_event_##call(void *__data, proto) \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ \ - event = trace_current_buffer_lock_reserve(&buffer, \ + event = trace_event_buffer_lock_reserve(&buffer, ftrace_file, \ event_call->event.type, \ sizeof(*entry) + __data_size, \ irq_flags, pc); \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 91fe40905828..29bff72f97ef 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1293,6 +1293,18 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); +struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc) +{ + *current_rb = ftrace_file->tr->buffer; + return trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, -- cgit From 15693458c4bc0693fd63a50d60f35b628fcf4e29 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 28 Feb 2013 19:59:17 -0500 Subject: tracing/ring-buffer: Move poll wake ups into ring buffer code Move the logic to wake up on ring buffer data into the ring buffer code itself. This simplifies the tracing code a lot and also has the added benefit that waiters on one of the instance buffers can be woken only when data is added to that instance instead of data added to any instance. Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 6 ++ kernel/trace/ring_buffer.c | 146 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.c | 83 ++++--------------------- 3 files changed, 164 insertions(+), 71 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1342e69542f3..d69cf637a15a 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -4,6 +4,7 @@ #include #include #include +#include struct ring_buffer; struct ring_buffer_iter; @@ -96,6 +97,11 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k __ring_buffer_alloc((size), (flags), &__key); \ }) +void ring_buffer_wait(struct ring_buffer *buffer, int cpu); +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table); + + #define RING_BUFFER_ALL_CPUS -1 void ring_buffer_free(struct ring_buffer *buffer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7244acde77b0..56b6ea32d2e7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -442,6 +443,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) return ret; } +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + bool waiters_pending; +}; + /* * head_page == tail_page && head == tail then buffer is empty. 
*/ @@ -476,6 +483,8 @@ struct ring_buffer_per_cpu { struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; + + struct rb_irq_work irq_work; }; struct ring_buffer { @@ -495,6 +504,8 @@ struct ring_buffer { struct notifier_block cpu_notify; #endif u64 (*clock)(void); + + struct rb_irq_work irq_work; }; struct ring_buffer_iter { @@ -506,6 +517,118 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + DEFINE_WAIT(wait); + struct rb_irq_work *work; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) + schedule(); + + finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. 
+ */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + work->waiters_pending = true; + poll_wait(filp, &work->waiters, poll_table); + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + return 0; +} + /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ @@ -1061,6 +1184,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1156,6 +1280,8 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->clock = trace_clock_local; buffer->reader_lock_key = key; + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + /* need at least two pages */ if (nr_pages < 2) nr_pages = 2; @@ -2610,6 +2736,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_end_commit(cpu_buffer); } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to @@ -2629,6 +2771,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + trace_recursive_unlock(); preempt_enable_notrace(); @@ -2801,6 +2945,8 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + ret = 0; out: preempt_enable_notrace(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3ec146c96df4..b5b25b6575a9 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -86,14 +85,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) */ static DEFINE_PER_CPU(bool, trace_cmdline_save); -/* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - /* * Kill all tracing for good (never come back). 
* It is initialized to 1 but will turn to zero if the initialization @@ -334,28 +325,12 @@ static inline void trace_access_lock_init(void) #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ - wake_up_all(&trace_wait); - -} - /** * tracing_on - enable tracing buffers * @@ -763,36 +738,11 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static void default_wait_pipe(struct trace_iterator *iter) { - DEFINE_WAIT(wait); - - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); - - /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. 
-	 */
-	trace_wakeup_needed = true;
-
-	if (trace_empty(iter))
-		schedule();
+	/* Iterators are static, they should be filled or empty */
+	if (trace_buffer_iter(iter, iter->cpu_file))
+		return;

-	finish_wait(&trace_wait, &wait);
+	ring_buffer_wait(iter->tr->buffer, iter->cpu_file);
 }

 /**
@@ -1262,11 +1212,6 @@ void
 __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
 {
 	__this_cpu_write(trace_cmdline_save, true);
-	if (trace_wakeup_needed) {
-		trace_wakeup_needed = false;
-		/* irq_work_queue() supplies it's own memory barriers */
-		irq_work_queue(&trace_work_wakeup);
-	}
 	ring_buffer_unlock_commit(buffer, event);
 }

@@ -3557,21 +3502,18 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 static unsigned int
 trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
 {
-	if (trace_flags & TRACE_ITER_BLOCK) {
+	/* Iterators are static, they should be filled or empty */
+	if (trace_buffer_iter(iter, iter->cpu_file))
+		return POLLIN | POLLRDNORM;
+
+	if (trace_flags & TRACE_ITER_BLOCK)
 		/*
 		 * Always select as readable when in blocking mode
 		 */
 		return POLLIN | POLLRDNORM;
-	} else {
-		if (!trace_empty(iter))
-			return POLLIN | POLLRDNORM;
-		trace_wakeup_needed = true;
-		poll_wait(filp, &trace_wait, poll_table);
-		if (!trace_empty(iter))
-			return POLLIN | POLLRDNORM;
-
-		return 0;
-	}
+	else
+		return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file,
					     filp, poll_table);
 }

 static unsigned int
@@ -5701,7 +5643,6 @@ __init static int tracer_alloc_buffers(void)
 #endif

 	trace_init_cmdlines();
-	init_irq_work(&trace_work_wakeup, trace_wake_up);

 	register_tracer(&nop_trace);
-- cgit

From f71130de5c7fba92faf3901784714e37a234c08f Mon Sep 17 00:00:00 2001
From: Li Zefan
Date: Thu, 21 Feb 2013 10:32:38 +0800
Subject: tracing: Add a helper function for event print functions

Move duplicate code in event print functions to a helper function.
This shrinks the size of the kernel by ~13K.

      text     data       bss       dec      hex  filename
   6596137  1743966  10138672  18478775  119f6b7  vmlinux.o.old
   6583002  1743849  10138672  18465523  119c2f3  vmlinux.o.new

Link: http://lkml.kernel.org/r/51258746.2060304@huawei.com

Signed-off-by: Li Zefan
Signed-off-by: Steven Rostedt
---
 include/linux/ftrace_event.h |  8 ++++++--
 include/trace/ftrace.h       | 23 ++++++-----------------
 kernel/trace/trace_output.c  | 26 ++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 19 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index fd28c170c597..4d79d2dc189c 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -38,6 +38,12 @@ const char *ftrace_print_symbols_seq_u64(struct trace_seq *p,
 const char *ftrace_print_hex_seq(struct trace_seq *p,
 				 const unsigned char *buf, int len);

+struct trace_iterator;
+struct trace_event;
+
+int ftrace_raw_output_prep(struct trace_iterator *iter,
+			   struct trace_event *event);
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
 * is printed in the end as a single line in the trace output, such as:
@@ -95,8 +101,6 @@ enum trace_iter_flags {
 };


-struct trace_event;
-
 typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
 					      int flags, struct trace_event *event);

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index e5d140a91fd7..17a77fcac2a2 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -227,29 +227,18 @@ static notrace enum print_line_t					\
 ftrace_raw_output_##call(struct trace_iterator *iter, int flags,	\
 			 struct trace_event *trace_event)		\
 {									\
-	struct ftrace_event_call *event;				\
 	struct trace_seq *s = &iter->seq;				\
+	struct trace_seq __maybe_unused *p = &iter->tmp_seq;		\
 	struct ftrace_raw_##call *field;				\
-	struct trace_entry *entry;					\
-	struct trace_seq *p = &iter->tmp_seq;				\
 	int ret;							\
 									\
-	event = container_of(trace_event, struct ftrace_event_call,	\
-			     event);					\
-									\
-	entry = iter->ent;						\
+	field = (typeof(field))iter->ent;				\
 									\
-	if (entry->type != event->event.type) {				\
-		WARN_ON_ONCE(1);					\
-		return TRACE_TYPE_UNHANDLED;				\
-	}								\
-									\
-	field = (typeof(field))entry;					\
-									\
-	trace_seq_init(p);						\
-	ret = trace_seq_printf(s, "%s: ", event->name);			\
+	ret = ftrace_raw_output_prep(iter, trace_event);		\
 	if (ret)							\
-		ret = trace_seq_printf(s, print);			\
+		return ret;						\
+									\
+	ret = trace_seq_printf(s, print);				\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 194d79602dc7..aa92ac322ba2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -397,6 +397,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
 }
 EXPORT_SYMBOL(ftrace_print_hex_seq);

+int ftrace_raw_output_prep(struct trace_iterator *iter,
+			   struct trace_event *trace_event)
+{
+	struct ftrace_event_call *event;
+	struct trace_seq *s = &iter->seq;
+	struct trace_seq *p = &iter->tmp_seq;
+	struct trace_entry *entry;
+	int ret;
+
+	event = container_of(trace_event, struct ftrace_event_call, event);
+	entry = iter->ent;
+
+	if (entry->type != event->event.type) {
+		WARN_ON_ONCE(1);
+		return TRACE_TYPE_UNHANDLED;
+	}
+
+	trace_seq_init(p);
+	ret = trace_seq_printf(s, "%s: ", event->name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+EXPORT_SYMBOL(ftrace_raw_output_prep);
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- cgit

From 2a30c11f6a037e2475f3c651bc57e697e79fa963 Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Mon, 4 Mar 2013 22:27:04 -0500
Subject: tracing: Add comment for trace event flag IGNORE_ENABLE

All the trace event flags have comments except the IGNORE_ENABLE
flag, which is set for ftrace internal events that should not be
enabled via the debugfs "enable" file. That is, if the top level
enable file is set, it will enable all events.

It used to just check the ftrace event call descriptor "reg" field
and skip those without it, but now some ftrace internal events have
a reg field but still need to be skipped. The flag was created to
ignore those events.

Now document it.
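For reference, the flag is consumed by checks of this shape; the sketch
below simply mirrors the event_create_dir() hunk earlier in this log:

	/* Internal ftrace events get no "enable" control file. */
	if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
		trace_create_file("enable", 0644, file->dir, file,
				  enable);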
Signed-off-by: Steven Rostedt
---
 include/linux/ftrace_event.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4d79d2dc189c..0b0814d90164 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -204,6 +204,7 @@ enum {
  *  FILTERED	  - The event has a filter attached
  *  CAP_ANY	  - Any user can enable for perf
  *  NO_SET_FILTER - Set when filter has error and is to be ignored
+ *  IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
-- cgit

From 575380da8b46969a2c6a7e14a51056a63b30fe2e Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Mon, 4 Mar 2013 23:05:12 -0500
Subject: tracing: Only clear trace buffer on module unload if event was traced

Currently, when a module with events is unloaded, the trace buffer is
cleared. This is just a safety net in case the module has some strange
callback that runs when its events are output. But there's no reason
to reset the buffer if the module didn't have any of its events traced.

Add a flag to the event "call" structure called WAS_ENABLED that gets
set when the event is ever enabled; this flag never gets cleared. When
a module gets unloaded, if any of its events have this flag set, then
the trace buffer will get cleared.

Signed-off-by: Steven Rostedt
---
 include/linux/ftrace_event.h |  5 +++++
 kernel/trace/trace_events.c  | 12 ++++++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 0b0814d90164..d6964244e567 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -197,6 +197,7 @@ enum {
 	TRACE_EVENT_FL_CAP_ANY_BIT,
 	TRACE_EVENT_FL_NO_SET_FILTER_BIT,
 	TRACE_EVENT_FL_IGNORE_ENABLE_BIT,
+	TRACE_EVENT_FL_WAS_ENABLED_BIT,
 };

 /*
@@ -205,12 +206,16 @@ enum {
  *  CAP_ANY	  - Any user can enable for perf
  *  NO_SET_FILTER - Set when filter has error and is to be ignored
  *  IGNORE_ENABLE - For ftrace internal events, do not enable with debugfs file
+ *  WAS_ENABLED   - Set and stays set when an event was ever enabled
+ *		    (used for module unloading, if a module event is enabled,
+ *		     it is best to clear the buffers that used it).
  */
 enum {
 	TRACE_EVENT_FL_FILTERED		= (1 << TRACE_EVENT_FL_FILTERED_BIT),
 	TRACE_EVENT_FL_CAP_ANY		= (1 << TRACE_EVENT_FL_CAP_ANY_BIT),
 	TRACE_EVENT_FL_NO_SET_FILTER	= (1 << TRACE_EVENT_FL_NO_SET_FILTER_BIT),
 	TRACE_EVENT_FL_IGNORE_ENABLE	= (1 << TRACE_EVENT_FL_IGNORE_ENABLE_BIT),
+	TRACE_EVENT_FL_WAS_ENABLED	= (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT),
 };

 struct ftrace_event_call {
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0f1307a29fcf..9a7dc4bf1171 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -245,6 +245,9 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file,
 			break;
 		}
 		file->flags |= FTRACE_EVENT_FL_ENABLED;
+
+		/* WAS_ENABLED gets set but never cleared. */
+		call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
 	}
 	break;
 }
@@ -1626,12 +1629,13 @@ static void trace_module_remove_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
-	bool found = false;
+	bool clear_trace = false;

 	down_write(&trace_event_mutex);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
-			found = true;
+			if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
+				clear_trace = true;
 			__trace_remove_event_call(call);
 		}
 	}
@@ -1648,9 +1652,9 @@ static void trace_module_remove_events(struct module *mod)

 	/*
 	 * It is safest to reset the ring buffer if the module being unloaded
-	 * registered any events.
+	 * registered any events that were used.
 	 */
-	if (found)
+	if (clear_trace)
 		tracing_reset_current_online_cpus();
 	up_write(&trace_event_mutex);
 }
-- cgit

From 12883efb670c28dff57dcd7f4f995a1ffe153b2d Mon Sep 17 00:00:00 2001
From: "Steven Rostedt (Red Hat)"
Date: Tue, 5 Mar 2013 09:24:35 -0500
Subject: tracing: Consolidate max_tr into main trace_array structure

Currently, the way the latency tracers and the snapshot feature work
is to have a separate trace_array called "max_tr" that holds the
snapshot buffer. For latency tracers, this snapshot buffer is used to
swap the running buffer with this buffer to save the current max
latency.

The only items needed for the max_tr are really just a copy of the
buffer itself, the per_cpu data pointers, the time_start timestamp
that states when the max latency was triggered, and the cpu that the
max latency was triggered on. All other fields in trace_array are
unused by the max_tr, making the max_tr mostly bloat.

This change removes the max_tr completely, and adds a new structure
called trace_buffer that holds the buffer pointer, the per_cpu data
pointers, the time_start timestamp, and the cpu where the latency
occurred.

The trace_array now has two trace_buffers, one for the normal trace
and one for the max trace or snapshot. By doing this, not only do we
remove the bloat from the max_tr, but trace instances can now use
their own snapshot feature, instead of only the top level global_trace
having the snapshot feature and latency tracers for itself.
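A sketch of the layout this implies, reconstructed from the description
above and the accesses visible in the diff below; member types and
ordering here are an assumption, not a quote of the final header:

	struct trace_buffer {
		struct trace_array		*tr;	/* assumed back pointer */
		struct ring_buffer		*buffer;
		struct trace_array_cpu __percpu	*data;	/* per-cpu trace data */
		cycle_t				time_start; /* when the max latency hit */
		int				cpu;	/* cpu that triggered it */
	};

	struct trace_array {
		/* ... */
		struct trace_buffer	trace_buffer;	/* live trace */
#ifdef CONFIG_TRACER_MAX_TRACE
		struct trace_buffer	max_buffer;	/* snapshot / max-latency copy */
#endif
		/* ... */
	};

With both buffers embedded in trace_array, update_max_tr() in the diff
reduces to swapping the two buffer pointers under ftrace_max_lock.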
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 + kernel/trace/blktrace.c | 4 +- kernel/trace/trace.c | 486 +++++++++++++++++++---------------- kernel/trace/trace.h | 37 ++- kernel/trace/trace_functions.c | 8 +- kernel/trace/trace_functions_graph.c | 12 +- kernel/trace/trace_irqsoff.c | 10 +- kernel/trace/trace_kdb.c | 8 +- kernel/trace/trace_mmiotrace.c | 12 +- kernel/trace/trace_output.c | 2 +- kernel/trace/trace_sched_switch.c | 8 +- kernel/trace/trace_sched_wakeup.c | 16 +- kernel/trace/trace_selftest.c | 42 +-- kernel/trace/trace_syscalls.c | 4 +- 14 files changed, 365 insertions(+), 286 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d6964244e567..d84c4a575514 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -8,6 +8,7 @@ #include struct trace_array; +struct trace_buffer; struct tracer; struct dentry; @@ -67,6 +68,7 @@ struct trace_entry { struct trace_iterator { struct trace_array *tr; struct tracer *trace; + struct trace_buffer *trace_buffer; void *private; int cpu_file; struct mutex mutex; diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 71259e2b6b61..90a55054744c 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c8a852a55db4..a08c127db865 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -195,27 +195,15 @@ cycle_t ftrace_now(int cpu) u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.buffer) + if (!global_trace.trace_buffer.buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); return ts; } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array max_tr; - int tracing_is_enabled(void) { return tracing_is_on(); @@ -339,8 +327,8 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | */ void tracing_on(void) { - if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_on(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. 
We don't really care about the race @@ -361,8 +349,8 @@ EXPORT_SYMBOL_GPL(tracing_on); */ void tracing_off(void) { - if (global_trace.buffer) - ring_buffer_record_off(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_off(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -378,8 +366,8 @@ EXPORT_SYMBOL_GPL(tracing_off); */ int tracing_is_on(void) { - if (global_trace.buffer) - return ring_buffer_record_is_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); return !global_trace.buffer_disabled; } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -637,13 +625,14 @@ unsigned long __read_mostly tracing_max_latency; static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu); - struct trace_array_cpu *max_data; + struct trace_buffer *trace_buf = &tr->trace_buffer; + struct trace_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; - max_data = per_cpu_ptr(max_tr.data, cpu); max_data->saved_latency = tracing_max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -686,9 +675,9 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); - buf = tr->buffer; - tr->buffer = max_tr.buffer; - max_tr.buffer = buf; + buf = tr->trace_buffer.buffer; + tr->trace_buffer.buffer = tr->max_buffer.buffer; + tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&ftrace_max_lock); @@ -716,7 +705,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) arch_spin_lock(&ftrace_max_lock); - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -725,7 +714,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * the max trace buffer (no one writes directly to it) * and flag that it failed. */ - trace_array_printk(&max_tr, _THIS_IP_, + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit in progress\n"); } @@ -742,7 +731,7 @@ static void default_wait_pipe(struct trace_iterator *iter) if (trace_buffer_iter(iter, iter->cpu_file)) return; - ring_buffer_wait(iter->tr->buffer, iter->cpu_file); + ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); } /** @@ -803,17 +792,19 @@ int register_tracer(struct tracer *type) * internal tracing to verify that everything is in order. * If we fail, we do not register this tracer. 
*/ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); tr->current_trace = type; +#ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { /* If we expanded the buffers, make sure the max is expanded too */ if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, trace_buf_size, + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, RING_BUFFER_ALL_CPUS); type->allocated_snapshot = true; } +#endif /* the test is responsible for initializing and enabling */ pr_info("Testing tracer %s: ", type->name); @@ -827,16 +818,18 @@ int register_tracer(struct tracer *type) goto out; } /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE if (type->use_max_tr) { type->allocated_snapshot = false; /* Shrink the max buffer again */ if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, 1, + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); } +#endif printk(KERN_CONT "PASSED\n"); } @@ -870,9 +863,9 @@ int register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; if (!buffer) return; @@ -886,9 +879,9 @@ void tracing_reset(struct trace_array *tr, int cpu) ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; int cpu; if (!buffer) @@ -899,7 +892,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) /* Make sure all commits have finished */ synchronize_sched(); - tr->time_start = ftrace_now(tr->cpu); + buf->time_start = ftrace_now(buf->cpu); for_each_online_cpu(cpu) ring_buffer_reset_cpu(buffer, cpu); @@ -909,7 +902,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) void tracing_reset_current(int cpu) { - tracing_reset(&global_trace, cpu); + tracing_reset(&global_trace.trace_buffer, cpu); } void tracing_reset_all_online_cpus(void) @@ -918,7 +911,10 @@ void tracing_reset_all_online_cpus(void) mutex_lock(&trace_types_lock); list_for_each_entry(tr, &ftrace_trace_arrays, list) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif } mutex_unlock(&trace_types_lock); } @@ -988,13 +984,15 @@ void tracing_start(void) /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); @@ -1026,7 +1024,7 @@ static void tracing_start_tr(struct trace_array *tr) goto out; } - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); @@ -1053,13 +1051,15 @@ void tracing_stop(void) /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if 
(buffer) ring_buffer_record_disable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); @@ -1080,7 +1080,7 @@ static void tracing_stop_tr(struct trace_array *tr) if (tr->stop_count++) goto out; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -1246,7 +1246,7 @@ trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = ftrace_file->tr->buffer; + *current_rb = ftrace_file->tr->trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1257,7 +1257,7 @@ trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = global_trace.buffer; + *current_rb = global_trace.trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1296,7 +1296,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -1437,7 +1437,7 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); } /** @@ -1453,7 +1453,8 @@ void trace_dump_stack(void) local_save_flags(flags); /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); + __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, 3, + preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); @@ -1623,7 +1624,7 @@ void trace_printk_init_buffers(void) * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ - if (global_trace.buffer) + if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } @@ -1683,7 +1684,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) @@ -1706,27 +1707,12 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) 
-{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct ring_buffer *buffer; int len = 0, size, pc; struct print_entry *entry; unsigned long flags; @@ -1754,7 +1740,6 @@ int trace_array_vprintk(struct trace_array *tr, local_save_flags(flags); size = sizeof(*entry) + len + 1; - buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, flags, pc); if (!event) @@ -1775,6 +1760,42 @@ int trace_array_vprintk(struct trace_array *tr, return len; } +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1800,7 +1821,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, lost_events); if (event) { @@ -1815,7 +1836,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->tr->buffer; + struct ring_buffer *buffer = iter->trace_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -1892,7 +1913,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } @@ -1925,13 +1946,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct trace_array *tr = iter->tr; struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; - per_cpu_ptr(tr->data, cpu)->skipped_entries = 0; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -1945,13 +1965,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * by the timestamp being before the start of the buffer. 
*/ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->tr->time_start) + if (ts >= iter->trace_buffer->time_start) break; entries++; ring_buffer_read(buf_iter, NULL); } - per_cpu_ptr(tr->data, cpu)->skipped_entries = entries; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; } /* @@ -1978,8 +1998,10 @@ static void *s_start(struct seq_file *m, loff_t *pos) *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); +#endif if (!iter->snapshot) atomic_inc(&trace_record_cmdline_disabled); @@ -2021,17 +2043,21 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; +#endif if (!iter->snapshot) atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, + unsigned long *total, unsigned long *entries) { unsigned long count; int cpu; @@ -2040,19 +2066,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); + count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ - if (per_cpu_ptr(tr->data, cpu)->skipped_entries) { - count -= per_cpu_ptr(tr->data, cpu)->skipped_entries; + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total += count; } else *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries += count; } } @@ -2069,27 +2095,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -2103,8 +2129,8 @@ void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = per_cpu_ptr(tr->data, tr->cpu); + struct trace_buffer *buf = iter->trace_buffer; + struct trace_array_cpu *data 
= per_cpu_ptr(buf->data, buf->cpu); struct tracer *type = iter->trace; unsigned long entries; unsigned long total; @@ -2112,7 +2138,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) name = type->name; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2123,7 +2149,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) nsecs_to_usecs(data->saved_latency), entries, total, - tr->cpu, + buf->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2174,7 +2200,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; - if (per_cpu_ptr(iter->tr->data, iter->cpu)->skipped_entries) + if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; cpumask_set_cpu(iter->cpu, iter->started); @@ -2304,7 +2330,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } return 1; @@ -2316,7 +2342,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } } @@ -2394,9 +2420,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->tr, m); + print_func_help_header(iter->trace_buffer, m); } } } @@ -2515,11 +2541,15 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE /* Currently only the top directory has a snapshot */ if (tr->current_trace->print_max || snapshot) - iter->tr = &max_tr; + iter->trace_buffer = &tr->max_buffer; else - iter->tr = tr; +#endif + iter->trace_buffer = &tr->trace_buffer; iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); @@ -2530,7 +2560,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->tr->buffer)) + if (ring_buffer_overruns(iter->trace_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. 
*/ @@ -2544,7 +2574,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -2554,7 +2584,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -2593,12 +2623,7 @@ static int tracing_release(struct inode *inode, struct file *file) return 0; iter = m->private; - - /* Only the global tracer has a matching max_tr */ - if (iter->tr == &max_tr) - tr = &global_trace; - else - tr = iter->tr; + tr = iter->tr; mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { @@ -2634,9 +2659,9 @@ static int tracing_open(struct inode *inode, struct file *file) struct trace_array *tr = tc->tr; if (tc->cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(tr, tc->cpu); + tracing_reset(&tr->trace_buffer, tc->cpu); } if (file->f_mode & FMODE_READ) { @@ -2805,13 +2830,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&per_cpu_ptr(tr->data, cpu)->disabled); - ring_buffer_record_disable_cpu(tr->buffer, cpu); + atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&per_cpu_ptr(tr->data, cpu)->disabled); - ring_buffer_record_enable_cpu(tr->buffer, cpu); + atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2930,9 +2955,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) trace_event_enable_cmd_record(enabled); if (mask == TRACE_ITER_OVERWRITE) { - ring_buffer_change_overwrite(global_trace.buffer, enabled); + ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); #ifdef CONFIG_TRACER_MAX_TRACE - ring_buffer_change_overwrite(max_tr.buffer, enabled); + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); #endif } @@ -3116,42 +3141,44 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); return t->init(tr); } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) { int cpu; for_each_tracing_cpu(cpu) - per_cpu_ptr(tr->data, cpu)->entries = val; + per_cpu_ptr(buf->data, cpu)->entries = val; } +#ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, - struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == 
RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(tr->buffer, - per_cpu_ptr(size_tr->data, cpu)->entries, cpu); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; - per_cpu_ptr(tr->data, cpu)->entries = - per_cpu_ptr(size_tr->data, cpu)->entries; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; } } else { - ret = ring_buffer_resize(tr->buffer, - per_cpu_ptr(size_tr->data, cpu_id)->entries, cpu_id); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) - per_cpu_ptr(tr->data, cpu_id)->entries = - per_cpu_ptr(size_tr->data, cpu_id)->entries; + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } +#endif /* CONFIG_TRACER_MAX_TRACE */ static int __tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu) @@ -3166,20 +3193,22 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, ring_buffer_expanded = 1; /* May be called before buffers are initialized */ - if (!tr->buffer) + if (!tr->trace_buffer.buffer) return 0; - ret = ring_buffer_resize(tr->buffer, size, cpu); + ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); if (ret < 0) return ret; +#ifdef CONFIG_TRACER_MAX_TRACE if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || !tr->current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size, cpu); + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(tr, tr, cpu); + int r = resize_buffer_duplicate_size(&tr->trace_buffer, + &tr->trace_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3202,15 +3231,17 @@ static int __tracing_resize_ring_buffer(struct trace_array *tr, } if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&max_tr, size); + set_buffer_entries(&tr->max_buffer, size); else - per_cpu_ptr(max_tr.data, cpu)->entries = size; + per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; out: +#endif /* CONFIG_TRACER_MAX_TRACE */ + if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(tr, size); + set_buffer_entries(&tr->trace_buffer, size); else - per_cpu_ptr(tr->data, cpu)->entries = size; + per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; return ret; } @@ -3277,7 +3308,9 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; +#endif int ret = 0; mutex_lock(&trace_types_lock); @@ -3308,7 +3341,10 @@ static int tracing_set_tracer(const char *buf) if (tr->current_trace->reset) tr->current_trace->reset(tr); +#ifdef CONFIG_TRACER_MAX_TRACE had_max_tr = tr->current_trace->allocated_snapshot; + + /* Current trace needs to be nop_trace before synchronize_sched */ tr->current_trace = &nop_trace; if (had_max_tr && !t->use_max_tr) { @@ -3325,22 +3361,28 @@ static int tracing_set_tracer(const char *buf) * The max_tr ring buffer has some state (e.g. ring->clock) and * we want preserve it. 
*/ - ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); tr->current_trace->allocated_snapshot = false; } +#else + tr->current_trace = &nop_trace; +#endif destroy_trace_option_files(topts); topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { /* we need to make per cpu buffer sizes equivalent */ - ret = resize_buffer_duplicate_size(&max_tr, &global_trace, + ret = resize_buffer_duplicate_size(&tr->max_buffer, &tr->trace_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; t->allocated_snapshot = true; } +#endif if (t->init) { ret = tracer_init(t, tr); @@ -3468,6 +3510,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) iter->cpu_file = tc->cpu; iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->trace_buffer; mutex_init(&iter->mutex); filp->private_data = iter; @@ -3518,7 +3561,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl */ return POLLIN | POLLRDNORM; else - return ring_buffer_poll_wait(iter->tr->buffer, iter->cpu_file, + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, filp, poll_table); } @@ -3857,8 +3900,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = per_cpu_ptr(tr->data, cpu)->entries; - if (size != per_cpu_ptr(tr->data, cpu)->entries) { + size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { buf_size_same = 0; break; } @@ -3874,7 +3917,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->data, tc->cpu)->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -3921,7 +3964,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += per_cpu_ptr(tr->data, cpu)->entries >> 10; + size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -4026,7 +4069,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { @@ -4111,16 +4154,19 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, tr->clock_id = i; - ring_buffer_set_clock(tr->buffer, trace_clocks[i].func); - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. 
*/ - tracing_reset_online_cpus(&global_trace); - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&global_trace.max_buffer); +#endif mutex_unlock(&trace_types_lock); @@ -4160,6 +4206,7 @@ static int tracing_snapshot_open(struct inode *inode, struct file *file) return -ENOMEM; } iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->max_buffer; m->private = iter; file->private_data = m; } @@ -4196,18 +4243,18 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: if (tr->current_trace->allocated_snapshot) { /* free spare buffer */ - ring_buffer_resize(max_tr.buffer, 1, + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); tr->current_trace->allocated_snapshot = false; } break; case 1: if (!tr->current_trace->allocated_snapshot) { /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&max_tr, - &global_trace, RING_BUFFER_ALL_CPUS); + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); if (ret < 0) break; tr->current_trace->allocated_snapshot = true; @@ -4220,7 +4267,7 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, break; default: if (tr->current_trace->allocated_snapshot) - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&tr->max_buffer); break; } @@ -4338,6 +4385,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) info->iter.tr = tr; info->iter.cpu_file = tc->cpu; info->iter.trace = tr->current_trace; + info->iter.trace_buffer = &tr->trace_buffer; info->spare = NULL; /* Force reading ring buffer for first read */ info->read = (unsigned int)-1; @@ -4369,7 +4417,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, return 0; if (!info->spare) - info->spare = ring_buffer_alloc_read_page(iter->tr->buffer, iter->cpu_file); + info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + iter->cpu_file); if (!info->spare) return -ENOMEM; @@ -4379,7 +4428,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, again: trace_access_lock(iter->cpu_file); - ret = ring_buffer_read_page(iter->tr->buffer, + ret = ring_buffer_read_page(iter->trace_buffer->buffer, &info->spare, count, iter->cpu_file, 0); @@ -4421,7 +4470,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file) struct trace_iterator *iter = &info->iter; if (info->spare) - ring_buffer_free_read_page(iter->tr->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); kfree(info); return 0; @@ -4521,7 +4570,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, again: trace_access_lock(iter->cpu_file); - entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; @@ -4532,7 +4581,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; ref->ref = 1; - ref->buffer = iter->tr->buffer; + ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, 
iter->cpu_file); if (!ref->page) { kfree(ref); @@ -4564,7 +4613,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(iter->tr->buffer, iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); } trace_access_unlock(iter->cpu_file); @@ -4605,6 +4654,7 @@ tracing_stats_read(struct file *filp, char __user *ubuf, { struct trace_cpu *tc = filp->private_data; struct trace_array *tr = tc->tr; + struct trace_buffer *trace_buf = &tr->trace_buffer; struct trace_seq *s; unsigned long cnt; unsigned long long t; @@ -4617,41 +4667,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_init(s); - cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); - cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); - cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[trace_clock_id].in_ns) { /* local or global for trace_clock */ - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", - ring_buffer_oldest_event_ts(tr->buffer, cpu)); + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(tr->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer, cpu)); } - cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); - cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4754,7 +4804,7 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) static void tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct trace_array_cpu *data = per_cpu_ptr(tr->data, cpu); + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -5038,7 +5088,7 @@ rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; @@ -5057,7 +5107,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct 
ring_buffer *buffer = tr->trace_buffer.buffer; unsigned long val; int ret; @@ -5129,18 +5179,18 @@ static int new_instance_create(const char *name) rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - tr->buffer = ring_buffer_alloc(trace_buf_size, rb_flags); - if (!tr->buffer) + tr->trace_buffer.buffer = ring_buffer_alloc(trace_buf_size, rb_flags); + if (!tr->trace_buffer.buffer) goto out_free_tr; - tr->data = alloc_percpu(struct trace_array_cpu); - if (!tr->data) + tr->trace_buffer.data = alloc_percpu(struct trace_array_cpu); + if (!tr->trace_buffer.data) goto out_free_tr; for_each_tracing_cpu(i) { - memset(per_cpu_ptr(tr->data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(tr->data, i)->trace_cpu.cpu = i; - per_cpu_ptr(tr->data, i)->trace_cpu.tr = tr; + memset(per_cpu_ptr(tr->trace_buffer.data, i), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(tr->trace_buffer.data, i)->trace_cpu.tr = tr; } /* Holder for file callbacks */ @@ -5164,8 +5214,8 @@ static int new_instance_create(const char *name) return 0; out_free_tr: - if (tr->buffer) - ring_buffer_free(tr->buffer); + if (tr->trace_buffer.buffer) + ring_buffer_free(tr->trace_buffer.buffer); kfree(tr->name); kfree(tr); @@ -5198,8 +5248,8 @@ static int instance_delete(const char *name) event_trace_del_tracer(tr); debugfs_remove_recursive(tr->dir); - free_percpu(tr->data); - ring_buffer_free(tr->buffer); + free_percpu(tr->trace_buffer.data); + ring_buffer_free(tr->trace_buffer.buffer); kfree(tr->name); kfree(tr); @@ -5439,6 +5489,7 @@ void trace_init_global_iter(struct trace_iterator *iter) iter->tr = &global_trace; iter->trace = iter->tr->current_trace; iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->trace_buffer = &global_trace.trace_buffer; } static void @@ -5476,7 +5527,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5544,7 +5595,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) trace_flags |= old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } tracing_on(); } @@ -5594,58 +5645,59 @@ __init static int tracer_alloc_buffers(void) raw_spin_lock_init(&global_trace.start_lock); /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.buffer) { + global_trace.trace_buffer.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); + if (!global_trace.trace_buffer.buffer) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); goto out_free_cpumask; } - global_trace.data = alloc_percpu(struct trace_array_cpu); + global_trace.trace_buffer.data = alloc_percpu(struct trace_array_cpu); - if (!global_trace.data) { + if (!global_trace.trace_buffer.data) { printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); WARN_ON(1); goto out_free_cpumask; } for_each_tracing_cpu(i) { - memset(per_cpu_ptr(global_trace.data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(global_trace.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(global_trace.data, i)->trace_cpu.tr = &global_trace; + memset(per_cpu_ptr(global_trace.trace_buffer.data, i), 0, + 
sizeof(struct trace_array_cpu)); + per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(global_trace.trace_buffer.data, i)->trace_cpu.tr = &global_trace; } if (global_trace.buffer_disabled) tracing_off(); #ifdef CONFIG_TRACER_MAX_TRACE - max_tr.data = alloc_percpu(struct trace_array_cpu); - if (!max_tr.data) { + global_trace.max_buffer.data = alloc_percpu(struct trace_array_cpu); + if (!global_trace.max_buffer.data) { printk(KERN_ERR "tracer: failed to allocate percpu memory!\n"); WARN_ON(1); goto out_free_cpumask; } - max_tr.buffer = ring_buffer_alloc(1, rb_flags); - raw_spin_lock_init(&max_tr.start_lock); - if (!max_tr.buffer) { + global_trace.max_buffer.buffer = ring_buffer_alloc(1, rb_flags); + if (!global_trace.max_buffer.buffer) { printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); WARN_ON(1); - ring_buffer_free(global_trace.buffer); + ring_buffer_free(global_trace.trace_buffer.buffer); goto out_free_cpumask; } for_each_tracing_cpu(i) { - memset(per_cpu_ptr(max_tr.data, i), 0, sizeof(struct trace_array_cpu)); - per_cpu_ptr(max_tr.data, i)->trace_cpu.cpu = i; - per_cpu_ptr(max_tr.data, i)->trace_cpu.tr = &max_tr; + memset(per_cpu_ptr(global_trace.max_buffer.data, i), 0, + sizeof(struct trace_array_cpu)); + per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.cpu = i; + per_cpu_ptr(global_trace.max_buffer.data, i)->trace_cpu.tr = &global_trace; } #endif /* Allocate the first page for all buffers */ - set_buffer_entries(&global_trace, - ring_buffer_size(global_trace.buffer, 0)); + set_buffer_entries(&global_trace.trace_buffer, + ring_buffer_size(global_trace.trace_buffer.buffer, 0)); #ifdef CONFIG_TRACER_MAX_TRACE - set_buffer_entries(&max_tr, 1); + set_buffer_entries(&global_trace.max_buffer, 1); #endif trace_init_cmdlines(); @@ -5682,8 +5734,10 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_percpu(global_trace.data); - free_percpu(max_tr.data); + free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE + free_percpu(global_trace.max_buffer.data); +#endif free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fa60b2977524..986834f1f4dd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -167,16 +167,37 @@ struct trace_array_cpu { struct tracer; +struct trace_buffer { + struct trace_array *tr; + struct ring_buffer *buffer; + struct trace_array_cpu __percpu *data; + cycle_t time_start; + int cpu; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { - struct ring_buffer *buffer; struct list_head list; char *name; - int cpu; + struct trace_buffer trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the trace_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the trace_buffer and the buffers are reset for + * the trace_buffer so the tracing can continue. 
+ */ + struct trace_buffer max_buffer; +#endif int buffer_disabled; struct trace_cpu trace_cpu; /* place holder */ #ifdef CONFIG_FTRACE_SYSCALLS @@ -189,7 +210,6 @@ struct trace_array { int clock_id; struct tracer *current_trace; unsigned int flags; - cycle_t time_start; raw_spinlock_t start_lock; struct dentry *dir; struct dentry *options; @@ -198,7 +218,6 @@ struct trace_array { struct list_head systems; struct list_head events; struct task_struct *waiter; - struct trace_array_cpu __percpu *data; }; enum { @@ -345,9 +364,11 @@ struct tracer { struct tracer *next; struct tracer_flags *flags; bool print_max; + bool enabled; +#ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; bool allocated_snapshot; - bool enabled; +#endif }; @@ -493,8 +514,8 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); @@ -674,6 +695,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 9d73861efc6a..e467c0c7bdd5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void); static int function_trace_init(struct trace_array *tr) { func_trace = tr; - tr->cpu = get_cpu(); + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, goto out; cpu = smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, */ local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ca986d61a282..8388bc99f2ee 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int 
trace_graph_entry(struct ftrace_graph_ent *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 9b52f9cf7a0d..5aa40ab72b57 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -121,7 +121,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags)) return 0; - *data = per_cpu_ptr(tr->data, cpu); + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -175,7 +175,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) per_cpu(tracing_cpu, cpu) = 0; tracing_max_latency = 0; - tracing_reset_online_cpus(irqsoff_trace); + tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); } @@ -380,7 +380,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (per_cpu(tracing_cpu, cpu)) return; - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -418,7 +418,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) if (!tracer_enabled) return; - data = per_cpu_ptr(tr->data, cpu); + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) @@ -568,7 +568,7 @@ static void __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); if (start_irqsoff_tracer(tr, is_graph())) printk(KERN_ERR "failed to start irqsoff tracer\n"); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 349f6941e8f2..bd90e1b06088 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags; @@ -46,14 +46,14 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == 
RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -83,7 +83,7 @@ out: trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&per_cpu_ptr(iter.tr->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } for_each_tracing_cpu(cpu) diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2472f6f76b50..a5e8f4878bfa 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr) overrun_detected = false; prev_overruns = 0; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->tr->buffer); + unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); if (over > prev_overruns) cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct ftrace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = per_cpu_ptr(tr->data, smp_processor_id()); + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct ftrace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = per_cpu_ptr(tr->data, smp_processor_id()); + data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index aa92ac322ba2..2edc7220d017 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -643,7 +643,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; - unsigned long long abs_ts = iter->ts - iter->tr->time_start; + unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; struct trace_seq *s 
= &iter->seq; diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 1ffe39abd6fc..4e98e3b257a3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->data, cpu); + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = per_cpu_ptr(ctx_trace->data, cpu); + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index f9ceb75a95b7..c16f8cd63c3c 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -89,7 +89,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = per_cpu_ptr(tr->data, cpu); + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -353,7 +353,7 @@ probe_wakeup_sched_switch(void *ignore, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -365,7 +365,7 @@ probe_wakeup_sched_switch(void *ignore, goto out_unlock; /* The task we are waiting for is waking up */ - data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu); + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +387,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +405,7 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); local_irq_save(flags); arch_spin_lock(&wakeup_lock); @@ -435,7 +435,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) return; pc = preempt_count(); - disabled = 
atomic_inc_return(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -458,7 +458,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) local_save_flags(flags); - data = per_cpu_ptr(wakeup_trace->data, wakeup_cpu); + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +472,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&per_cpu_ptr(wakeup_trace->data, cpu)->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c29..8672c40cb153 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) local_irq_save(flags); arch_spin_lock(&ftrace_max_lock); - cnt = ring_buffer_entries(tr->buffer); + cnt = ring_buffer_entries(buf->buffer); /* * The trace_test_buffer_cpu runs a while loop to consume all data. 
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) */ tracing_off(); for_each_possible_cpu(cpu) { - ret = trace_test_buffer_cpu(tr, cpu); + ret = trace_test_buffer_cpu(buf, cpu); if (ret) break; } @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); if (ret) goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); tracing_start(); /* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -737,7 +737,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); @@ -760,7 +760,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -815,9 +815,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -877,9 +877,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -943,11 +943,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (ret) goto out; @@ -973,11 +973,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1084,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* stop the tracing. 
*/ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); printk("ret = %d\n", ret); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); @@ -1126,7 +1126,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 1cd37ffb4093..68f3f344be65 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -321,7 +321,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->enter_event->event.type, size, 0, 0); if (!event) @@ -355,7 +355,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) if (!sys_data) return; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) -- cgit From ad909e21bbe69f1d39055d346540abd827190eca Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 6 Mar 2013 21:45:37 -0500 Subject: tracing: Add internal tracing_snapshot() functions The new snapshot feature is quite handy. It's a way for the user to take advantage of the spare buffer that, until then, only the latency tracers used to "snapshot" the buffer when it hit a max latency. Now users can trigger a "snapshot" manually when some condition is hit in a program. But a snapshot currently can not be triggered by a condition inside the kernel. With the addition of tracing_snapshot() and tracing_snapshot_alloc(), snapshots can now be taken when a condition is hit and the developer wants to capture the case without stopping the trace. Note, any snapshot will overwrite the old one, so take care in how this is done. These new functions are to be used like tracing_on(), tracing_off() and trace_printk() are. That is, they should never be called in the mainline Linux kernel. They are solely for the purpose of debugging. The tracing_snapshot() will not allocate a buffer, and it is safe to call from any context (except NMIs). But if a snapshot buffer isn't allocated when it is called, it will write to the live buffer, complaining about the lack of a snapshot buffer, and then stop tracing (giving you the "permanent snapshot"). tracing_snapshot_alloc() will allocate the snapshot buffer if it was not already allocated and then take the snapshot. This routine *may sleep*, and must be called from a context that can sleep. The allocation is done with GFP_KERNEL and is not atomic. If you need a snapshot in an atomic context, say in early boot, it is best to call tracing_snapshot_alloc() beforehand to allocate the buffer; then you can use tracing_snapshot() anywhere you want and still get snapshots. 
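To make the intended usage concrete, a minimal sketch follows. It assumes only what the message above states; my_debug_init() and my_fast_path() are hypothetical names, not part of this patch.

#include <linux/init.h>
#include <linux/kernel.h>

/* Hypothetical init code, running in a context that may sleep. */
static int __init my_debug_init(void)
{
	/*
	 * Allocate the spare buffer up front (may sleep, GFP_KERNEL),
	 * so later snapshots can be taken from atomic context.
	 */
	tracing_snapshot_alloc();
	return 0;
}

/* Hypothetical fast path; may run with interrupts disabled (but not in NMI). */
static void my_fast_path(int status)
{
	if (unlikely(status < 0)) {
		trace_printk("bad status %d, snapshotting\n", status);
		/* Swap the live buffer with the snapshot buffer, keep tracing. */
		tracing_snapshot();
	}
}
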
Cc: Hiraku Toyooka Cc: Thomas Gleixner Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 4 +++ kernel/trace/trace.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index c566927efcbd..bc5392a326ab 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -483,6 +483,8 @@ enum ftrace_dump_mode { void tracing_on(void); void tracing_off(void); int tracing_is_on(void); +void tracing_snapshot(void); +void tracing_snapshot_alloc(void); extern void tracing_start(void); extern void tracing_stop(void); @@ -570,6 +572,8 @@ static inline void trace_dump_stack(void) { } static inline void tracing_on(void) { } static inline void tracing_off(void) { } static inline int tracing_is_on(void) { return 0; } +static inline void tracing_snapshot(void) { } +static inline void tracing_snapshot_alloc(void) { } static inline __printf(1, 2) int trace_printk(const char *fmt, ...) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3a89496dc99b..307524d784ec 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -339,6 +339,90 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + struct tracer *tracer = tr->current_trace; + unsigned long flags; + + if (!tr->allocated_snapshot) { + trace_printk("*** SNAPSHOT NOT ALLOCATED ***\n"); + trace_printk("*** stopping trace here! ***\n"); + tracing_off(); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + trace_printk("*** LATENCY TRACER ACTIVE ***\n"); + trace_printk("*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id()); + local_irq_restore(flags); +} + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id); + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. 
+ */ +void tracing_snapshot_alloc(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (WARN_ON(ret < 0)) + return; + + tr->allocated_snapshot = true; + } + + tracing_snapshot(); +} +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +#endif /* CONFIG_TRACER_SNAPSHOT */ + /** * tracing_off - turn off tracing buffers * -- cgit From 09ae72348eccb60e304cf8ce94653f4a78fcd407 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 8 Mar 2013 21:02:34 -0500 Subject: tracing: Add trace_puts() for even faster trace_printk() tracing The trace_printk() is extremely fast and is very handy as it can be used in any context (including NMIs!). But it still requires scanning the fmt string for parsing the args. Even the trace_bprintk() requires a scan to know what args will be saved, although it doesn't copy the format string itself. Several times trace_printk() has no args, and wastes cpu cycles scanning the fmt string. Adding trace_puts() allows the developer to use an even faster tracing method that only saves the pointer to the string in the ring buffer without doing any format parsing at all. This will help remove even more of the "Heisenbug" effect, when debugging. Also fixed up the F_printk()s for the ftrace internal bprint and print events. Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 41 +++++++++++++++++++++++- kernel/trace/trace.c | 76 ++++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_entries.h | 23 +++++++++++--- kernel/trace/trace_output.c | 75 +++++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_output.h | 2 ++ 6 files changed, 214 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index bc5392a326ab..a3a5574a61fc 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -514,7 +514,8 @@ do { \ * * This is intended as a debugging tool for the developer only. * Please refrain from leaving trace_printks scattered around in - * your code. + * your code. (Extra memory is used for special buffers that are + * allocated when trace_printk() is used) */ #define trace_printk(fmt, args...) \ @@ -537,6 +538,44 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...); extern __printf(2, 3) int __trace_printk(unsigned long ip, const char *fmt, ...); +/** + * trace_puts - write a string into the ftrace buffer + * @str: the string to record + * + * Note: __trace_bputs is an internal function for trace_puts and + * the @ip is passed in via the trace_puts macro. + * + * This is similar to trace_printk() but is made for those really fast + * paths that a developer wants the least amount of "Heisenbug" affects, + * where the processing of the print format is still too much. + * + * This function allows a kernel developer to debug fast path sections + * that printk is not appropriate for. By scattering in various + * printk like tracing in the code, a developer can quickly see + * where problems are occurring. + * + * This is intended as a debugging tool for the developer only. + * Please refrain from leaving trace_puts scattered around in + * your code. 
(Extra memory is used for special buffers that are + * allocated when trace_puts() is used) + * + * Returns: 0 if nothing was written, positive # if string was. + * (1 when __trace_bputs is used, strlen(str) when __trace_puts is used) + */ + +extern int __trace_bputs(unsigned long ip, const char *str); +extern int __trace_puts(unsigned long ip, const char *str, int size); +#define trace_puts(str) ({ \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(str) ? str : NULL; \ + \ + if (__builtin_constant_p(str)) \ + __trace_bputs(_THIS_IP_, trace_printk_fmt); \ + else \ + __trace_puts(_THIS_IP_, str, strlen(str)); \ +}) + extern void trace_dump_stack(void); /* diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4021a5e66412..5043a0c4dde0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -350,6 +350,77 @@ void tracing_on(void) } EXPORT_SYMBOL_GPL(tracing_on); +/** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + int alloc; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + + return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct bputs_entry *entry; + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + #ifdef CONFIG_TRACER_SNAPSHOT /** * trace_snapshot - take a snapshot of the current buffer. 
@@ -2475,6 +2546,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 26bc71834041..d5764a8532e2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -34,6 +34,7 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, + TRACE_BPUTS, __TRACE_LAST_TYPE, }; @@ -277,6 +278,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca2..e2d027ac66a2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->fmt), FILTER_OTHER ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%08lx %s", - __entry->ip, __entry->buf), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->buf), + + FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%pf: %s", + (void *)__entry->ip, __entry->str), FILTER_OTHER ); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 2edc7220d017..19f48e7edc39 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) return ret; } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_puts(s, field->str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -1244,6 +1260,64 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(s, ": %lx : ", 
field->ip)) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1356,6 +1430,7 @@ static struct trace_event *events[] __initdata = { &trace_wake_event, &trace_stack_event, &trace_user_stack_event, + &trace_bputs_event, &trace_bprint_event, &trace_print_event, NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492b..af77870de278 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,6 +4,8 @@ #include #include "trace.h" +extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t -- cgit From 9d3c752c062e3266f1051ba0825276ea1e2777da Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 8 Mar 2013 22:11:57 -0500 Subject: tracing: Optimize trace_printk() with one arg to use trace_puts() Although trace_printk() is extremely fast, especially when it uses trace_bprintk() (writes args straight to the buffer instead of inserting them into a string), it still has the overhead of calling one of the printf sprintf() functions, which need to scan the fmt string to determine what args, if any, it has. This is a waste of precious CPU cycles if the printk format has no args but a single constant string. It is better to use trace_puts(), which does not have the overhead of the fmt scanning. But wouldn't it be nice if the developer didn't have to think about such things, and the compiler would just do it for them? trace_printk("this string has no args\n"); [...] trace_printk("this string does %p %d\n", foo, bar); As tracing is critical to have the least amount of overhead, especially when dealing with race conditions, and you want to eliminate any "Heisenbugs", you want trace_printk() to use the fastest possible means of tracing. Currently the macro magic determines whether it will use trace_bprintk(), or, if the fmt is a dynamic string (a variable), fall back to the slow trace_printk() method that does a full snprintf() before copying it into the buffer, whereas trace_bprintk() only copies the pointer to the fmt and the args into the buffer. Well, now there's a way to spend some more Hogwarts cash and come up with new fancy macro magic. #define trace_printk(fmt, ...) \ do { \ char _______STR[] = __stringify((__VA_ARGS__)); \ if (sizeof(_______STR) > 3) \ do_trace_printk(fmt, ##__VA_ARGS__); \ else \ trace_puts(fmt); \ } while (0) The above needs a bit of explaining (both here and in the comments). By stringifying the __VA_ARGS__, we can, at compile time, determine whether any args are being passed to trace_printk(). The extra parentheses are required, otherwise the compiler complains about too many parameters for __stringify if there is more than one arg. When there are no args, the __stringify((__VA_ARGS__)) converts into "()\0", a string of 3 characters. Anything else will be a string containing more than 3 characters. Now we assign that string to a dynamic char array and then take the sizeof() of that array, as the small userspace demonstration below shows. 
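The stringify step can be checked with a small standalone userspace program (a hedged demonstration, not kernel code: __stringify is defined here the same way the kernel's include/linux/stringify.h defines it, and the empty-__VA_ARGS__ case relies on the same gcc behavior the kernel macro relies on).

#include <stdio.h>

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

/* Mimics only the detection step of the trace_printk() macro above. */
#define check_args(...) \
do { \
	char _______STR[] = __stringify((__VA_ARGS__)); \
	printf("%-10s sizeof=%zu -> %s\n", _______STR, \
	       sizeof(_______STR), \
	       sizeof(_______STR) > 3 ? "has args" : "no args"); \
} while (0)

int main(void)
{
	check_args();		/* "()"     sizeof=3 -> no args  */
	check_args(1);		/* "(1)"    sizeof=4 -> has args */
	check_args(1, 2);	/* "(1, 2)" sizeof=7 -> has args */
	return 0;
}
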
If it is greater than 3 characters, we know trace_printk() has args and we need to do the full "do_trace_printk()" on them, otherwise it was only passed a single arg and we can optimize to use trace_puts(). Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Steven "The King of Nasty Macros!" Rostedt --- include/linux/kernel.h | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a3a5574a61fc..d0a16fe03fef 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -516,9 +516,30 @@ do { \ * Please refrain from leaving trace_printks scattered around in * your code. (Extra memory is used for special buffers that are * allocated when trace_printk() is used) + * + * A little optization trick is done here. If there's only one + * argument, there's no need to scan the string for printf formats. + * The trace_puts() will suffice. But how can we take advantage of + * using trace_puts() when trace_printk() has only one argument? + * By stringifying the args and checking the size we can tell + * whether or not there are args. __stringify((__VA_ARGS__)) will + * turn into "()\0" with a size of 3 when there are no args, anything + * else will be bigger. All we need to do is define a string to this, + * and then take its size and compare to 3. If it's bigger, use + * do_trace_printk() otherwise, optimize it to trace_puts(). Then just + * let gcc optimize the rest. */ -#define trace_printk(fmt, args...) \ +#define trace_printk(fmt, ...) \ +do { \ + char _______STR[] = __stringify((__VA_ARGS__)); \ + if (sizeof(_______STR) > 3) \ + do_trace_printk(fmt, ##__VA_ARGS__); \ + else \ + trace_puts(fmt); \ +} while (0) + +#define do_trace_printk(fmt, args...) \ do { \ static const char *trace_printk_fmt \ __attribute__((section("__trace_printk_fmt"))) = \ -- cgit From 57d01ad09721fb7719c4c8c72b434398186f35a0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 12 Mar 2013 12:38:06 -0400 Subject: tracing: Fix comments for ftrace_event_file/call flags Most of the flags for the struct ftrace_event_file were moved over to the flags of the struct ftrace_event_call, but the comments were never updated. 
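As background for the bit comments being fixed here (and for the soft-disable patch further below), the bit-index/mask convention works as in this small standalone sketch; the MY_FL_* names are made up for illustration:

#include <stdio.h>

/* One enum of bit indices, one enum of masks derived from them. */
enum {
	MY_FL_ENABLED_BIT,		/* bit 0 */
	MY_FL_RECORDED_CMD_BIT,		/* bit 1 */
};

enum {
	MY_FL_ENABLED		= (1 << MY_FL_ENABLED_BIT),
	MY_FL_RECORDED_CMD	= (1 << MY_FL_RECORDED_CMD_BIT),
};

int main(void)
{
	unsigned long flags = 0;

	/* Kernel code sets this atomically: set_bit(MY_FL_ENABLED_BIT, &flags). */
	flags |= MY_FL_ENABLED;
	printf("enabled=%d\n", !!(flags & MY_FL_ENABLED));

	/*
	 * Mixing the two enums up is exactly the bug smatch caught in the
	 * soft-disable patch below: (flags & MY_FL_ENABLED_BIT) tests
	 * against the index (0 here), not the mask.
	 */
	return 0;
}
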
Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d84c4a575514..4cb6cd8338a4 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -230,6 +230,13 @@ struct ftrace_event_call { struct list_head *files; void *mod; void *data; + /* + * bit 0: filter_active + * bit 1: allow trace by non root (cap any) + * bit 2: failed to apply filter + * bit 3: ftrace internal event (do not enable) + * bit 4: Event was enabled by module + */ int flags; /* static flags of different events */ #ifdef CONFIG_PERF_EVENTS @@ -248,7 +255,7 @@ enum { /* * Ftrace event file flags: - * ENABELD - The event is enabled + * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch */ enum { @@ -265,12 +272,8 @@ struct ftrace_event_file { /* * 32 bit flags: - * bit 1: enabled - * bit 2: filter_active - * bit 3: enabled cmd record - * bit 4: allow trace by non root (cap any) - * bit 5: failed to apply filter - * bit 6: ftrace internal event (do not enable) + * bit 0: enabled + * bit 1: enabled cmd record * * Changes to flags must hold the event_mutex. * -- cgit From e67efb93f0e9130174293ffaa5975f87b301b531 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 12 Mar 2013 15:07:59 -0400 Subject: ftrace: Clean up function probe methods When a function probe is created, each function that the probe is attached to, a "callback" method is called. On release of the probe, each function entry calls the "free" method. First, "callback" is a confusing name and does not really match what it does. Callback sounds like it will be called when the probe triggers. But that's not the case. This is really an "init" function, so lets rename it as such. Secondly, both "init" and "free" do not pass enough information back to the handlers. Pass back the ops, ip and data for each time the method is called. We have the information, might as well use it. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 6 ++++-- kernel/trace/ftrace.c | 6 +++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index e5ca8ef50e9b..832422d706f4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -259,8 +259,10 @@ struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, void **data); - int (*callback)(unsigned long ip, void **data); - void (*free)(void **data); + int (*init)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); + void (*free)(struct ftrace_probe_ops *ops, + unsigned long ip, void **data); int (*print)(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index dab031fec85b..ff0ef41c6d93 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2984,7 +2984,7 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp) container_of(rhp, struct ftrace_func_probe, rcu); if (entry->ops->free) - entry->ops->free(&entry->data); + entry->ops->free(entry->ops, entry->ip, &entry->data); kfree(entry); } @@ -3045,8 +3045,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * for each function we find. We call the callback * to give the caller an opportunity to do so. 
*/ - if (ops->callback) { - if (ops->callback(rec->ip, &entry->data) < 0) { + if (ops->init) { + if (ops->init(ops, rec->ip, &entry->data) < 0) { /* caller does not like this func */ kfree(entry); continue; -- cgit From 417944c4c7a0f657158d0515f3b8e8c043fd788f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 12 Mar 2013 13:26:18 -0400 Subject: tracing: Add a way to soft disable trace events In order to let triggers enable or disable events, we need a 'soft' method for doing so. For example, if a function probe is added that lets a user enable or disable events when a function is called, that change must be done without taking locks or a mutex, and it definitely can't sleep. But the full enabling of a tracepoint is expensive. By adding a 'SOFT_DISABLE' flag, and converting the flags to be updated without the protection of a mutex (using set/clear_bit()), this soft disable flag can be used to allow critical sections to enable or disable events from being traced (after the event has been placed into "SOFT_MODE"). Some caveats though: The comm recorder (to map pids with a comm) can not be soft disabled (yet). If you disable an event with a "soft" disable and wait a while before reading the trace, the comm cache may be replaced and you'll get a bunch of <...> for comms in the trace. Reading the "enable" file for an event that is soft disabled will now give you "0*" where the '*' denotes that the tracepoint is still active but the event itself is "disabled". [ fixed _BIT used in & operation : thanks to Dan Carpenter and smatch ] Cc: Dan Carpenter Cc: Tom Zanussi Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 20 ++++++++---- include/trace/ftrace.h | 8 +++++ kernel/trace/trace_events.c | 75 ++++++++++++++++++++++++++++++++++++-------- 3 files changed, 84 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4cb6cd8338a4..4e28b011e63b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -251,16 +251,23 @@ struct ftrace_subsystem_dir; enum { FTRACE_EVENT_FL_ENABLED_BIT, FTRACE_EVENT_FL_RECORDED_CMD_BIT, + FTRACE_EVENT_FL_SOFT_MODE_BIT, + FTRACE_EVENT_FL_SOFT_DISABLED_BIT, }; /* * Ftrace event file flags: * ENABLED - The event is enabled * RECORDED_CMD - The comms should be recorded at sched_switch * SOFT_MODE - The event is enabled/disabled by SOFT_DISABLED + * SOFT_DISABLED - When set, do not trace the event (even though its + * tracepoint may be enabled) */ enum { FTRACE_EVENT_FL_ENABLED = (1 << FTRACE_EVENT_FL_ENABLED_BIT), FTRACE_EVENT_FL_RECORDED_CMD = (1 << FTRACE_EVENT_FL_RECORDED_CMD_BIT), + FTRACE_EVENT_FL_SOFT_MODE = (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT), + FTRACE_EVENT_FL_SOFT_DISABLED = (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT), }; struct ftrace_event_file { @@ -274,17 +281,18 @@ struct ftrace_event_file { * 32 bit flags: * bit 0: enabled * bit 1: enabled cmd record + * bit 2: enable/disable with the soft disable bit + * bit 3: soft disabled * - * Changes to flags must hold the event_mutex. - * - * Note: Reads of flags do not hold the event_mutex since - * they occur in critical sections. But the way flags + * Note: The bits must be set atomically to prevent races + * from other writers. Reads of flags do not need to be in + * sync as they occur in critical sections. 
But the way flags * is currently used, these changes do not affect the code * except that when a change is made, it may have a slight * delay in propagating the changes to other CPUs due to - * caching and such. + * caching and such. Which is mostly OK ;-) */ - unsigned int flags; + unsigned long flags; }; #define __TRACE_EVENT_FLAGS(name, value) \ diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index bbf09c2021b9..4bda044e6c77 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -413,6 +413,10 @@ static inline notrace int ftrace_get_offsets_##call( \ * int __data_size; * int pc; * + * if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, + * &ftrace_file->flags)) + * return; + * * local_save_flags(irq_flags); * pc = preempt_count(); * @@ -518,6 +522,10 @@ ftrace_raw_event_##call(void *__data, proto) \ int __data_size; \ int pc; \ \ + if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, \ + &ftrace_file->flags)) \ + return; \ + \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 38b54c5edeb9..106640b0df4a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -205,37 +205,77 @@ void trace_event_enable_cmd_record(bool enable) if (enable) { tracing_start_cmdline_record(); - file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_file *file, - int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) { struct ftrace_event_call *call = file->event_call; int ret = 0; + int disable; switch (enable) { case 0: - if (file->flags & FTRACE_EVENT_FL_ENABLED) { - file->flags &= ~FTRACE_EVENT_FL_ENABLED; + /* + * When soft_disable is set and enable is cleared, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; + clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } else + disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + + if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { + clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - file->flags &= ~FTRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } call->class->reg(call, TRACE_REG_UNREGISTER, file); } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). 
If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + + /* Keep the event disabled, when going to SOFT_MODE. */ + if (soft_disable) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - file->flags |= FTRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { @@ -244,7 +284,7 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, "%s\n", call->name); break; } - file->flags |= FTRACE_EVENT_FL_ENABLED; + set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); /* WAS_ENABLED gets set but never cleared. */ call->flags |= TRACE_EVENT_FL_WAS_ENABLED; @@ -255,6 +295,12 @@ static int ftrace_event_enable_disable(struct ftrace_event_file *file, return ret; } +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) +{ + return __ftrace_event_enable_disable(file, enable, 0); +} + static void ftrace_clear_events(struct trace_array *tr) { struct ftrace_event_file *file; @@ -547,12 +593,15 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_file *file = filp->private_data; char *buf; - if (file->flags & FTRACE_EVENT_FL_ENABLED) - buf = "1\n"; - else + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) + buf = "0*\n"; + else + buf = "1\n"; + } else buf = "0\n"; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t -- cgit From c142be8ebe0b7bf73c8a0063925623f3e4b980c0 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 13 Mar 2013 09:55:57 -0400 Subject: tracing: Add skip argument to trace_dump_stack() Although trace_dump_stack() already skips three functions in the call to the stack trace, which gets the stack trace to start at the caller of the function, the caller may want to skip some more too (as it may have helper functions). Add a skip argument to trace_dump_stack() that lets the caller skip functions in the backtrace that it doesn't care about. 
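A short sketch of why the argument matters; my_debug_checkpoint() is a hypothetical helper, not part of the patch.

#include <linux/kernel.h>

/* Hypothetical helper wrapping the dump; callers want their own frame first. */
static void my_debug_checkpoint(const char *why)
{
	trace_printk("checkpoint: %s\n", why);
	/*
	 * Skip one extra frame so the recorded stack starts at this
	 * helper's caller rather than at the helper itself.
	 */
	trace_dump_stack(1);
}
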
Signed-off-by: Steven Rostedt --- include/linux/kernel.h | 2 +- kernel/trace/trace.c | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d0a16fe03fef..239dbb9627ca 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -597,7 +597,7 @@ extern int __trace_puts(unsigned long ip, const char *str, int size); __trace_puts(_THIS_IP_, str, strlen(str)); \ }) -extern void trace_dump_stack(void); +extern void trace_dump_stack(int skip); /* * The double __builtin_constant_p is because gcc will give us an error diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c5b844621562..8aa53213201f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1657,8 +1657,9 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, /** * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) */ -void trace_dump_stack(void) +void trace_dump_stack(int skip) { unsigned long flags; @@ -1667,9 +1668,13 @@ void trace_dump_stack(void) local_save_flags(flags); - /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, 3, - preempt_count(), NULL); + /* + * Skip 3 more, seems to get us at the caller of + * this function. + */ + skip += 3; + __ftrace_trace_stack(global_trace.trace_buffer.buffer, + flags, skip, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); -- cgit From 8aacf017b065a805d27467843490c976835eb4a5 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 14 Mar 2013 13:13:45 -0400 Subject: tracing: Add "uptime" trace clock that uses jiffies Add a simple trace clock called "uptime" for those that are interested in the uptime of the trace. It uses jiffies as that's the safest method, as other uptime clocks grab seq locks, which could cause a deadlock if taken from an event or function tracer. Requested-by: Mauro Carvalho Chehab Cc: Thomas Gleixner Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- include/linux/trace_clock.h | 1 + kernel/trace/trace.c | 1 + kernel/trace/trace_clock.c | 10 ++++++++++ 3 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h index d563f37e1a1d..1d7ca2739272 100644 --- a/include/linux/trace_clock.h +++ b/include/linux/trace_clock.h @@ -16,6 +16,7 @@ extern u64 notrace trace_clock_local(void); extern u64 notrace trace_clock(void); +extern u64 notrace trace_clock_jiffies(void); extern u64 notrace trace_clock_global(void); extern u64 notrace trace_clock_counter(void); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f90ca16afcf2..8eabfbb8003e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -647,6 +647,7 @@ static struct { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 1 }, ARCH_TRACE_CLOCKS }; diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae6..26dc348332b7 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void) return local_clock(); } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. 
+ */ +u64 notrace trace_clock_jiffies(void) +{ + u64 jiffy = jiffies - INITIAL_JIFFIES; + + /* Return nsecs */ + return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +} /* * trace_clock_global(): special globally coherent trace clock -- cgit From b92021b09df70c1609e3547f3d6128dd560be97f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 15 Mar 2013 15:04:17 +1030 Subject: CONFIG_SYMBOL_PREFIX: cleanup. We have CONFIG_SYMBOL_PREFIX, which three archs define to the string "_". But Al Viro broke this in "consolidate cond_syscall and SYSCALL_ALIAS declarations" (in linux-next), and he's not the first to do so. Using CONFIG_SYMBOL_PREFIX is awkward, since we usually just want to prefix it to something. So various places define helpers which are defined to nothing if CONFIG_SYMBOL_PREFIX isn't set: 1) include/asm-generic/unistd.h defines __SYMBOL_PREFIX. 2) include/asm-generic/vmlinux.lds.h defines VMLINUX_SYMBOL(sym) 3) include/linux/export.h defines MODULE_SYMBOL_PREFIX. 4) include/linux/kernel.h defines SYMBOL_PREFIX (which differs from #7) 5) kernel/modsign_certificate.S defines ASM_SYMBOL(sym) 6) scripts/modpost.c defines MODULE_SYMBOL_PREFIX 7) scripts/Makefile.lib defines SYMBOL_PREFIX on the commandline if CONFIG_SYMBOL_PREFIX is set, so that we have a non-string version for pasting. (arch/h8300/include/asm/linkage.h defines SYMBOL_NAME(), too). Let's solve this properly: 1) No more generic prefix, just CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX. 2) Make linux/export.h usable from asm. 3) Define VMLINUX_SYMBOL() and VMLINUX_SYMBOL_STR(). 4) Make everyone use them. Signed-off-by: Rusty Russell Reviewed-by: James Hogan Tested-by: James Hogan (metag) --- Makefile | 2 +- arch/Kconfig | 6 ++++++ arch/blackfin/Kconfig | 5 +---- arch/h8300/Kconfig | 5 +---- arch/metag/Kconfig | 5 +---- drivers/mtd/chips/gen_probe.c | 8 +++++--- include/asm-generic/unistd.h | 12 ++++-------- include/asm-generic/vmlinux.lds.h | 8 +------- include/linux/export.h | 20 ++++++++++++++------ include/linux/kernel.h | 7 ------- include/linux/module.h | 4 ++-- kernel/modsign_certificate.S | 13 +++---------- kernel/module.c | 2 +- scripts/Makefile.lib | 7 ------- scripts/link-vmlinux.sh | 5 ++--- scripts/mod/modpost.c | 36 +++++++++++++++--------------------- 16 files changed, 57 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/Makefile b/Makefile index a05ea42c5f18..0b09ba5e492a 100644 --- a/Makefile +++ b/Makefile @@ -1398,7 +1398,7 @@ quiet_cmd_rmfiles = $(if $(wildcard $(rm-files)),CLEAN $(wildcard $(rm-files)) # Run depmod only if we have System.map and depmod is executable quiet_cmd_depmod = DEPMOD $(KERNELRELEASE) cmd_depmod = $(CONFIG_SHELL) $(srctree)/scripts/depmod.sh $(DEPMOD) \ - $(KERNELRELEASE) "$(patsubst "%",%,$(CONFIG_SYMBOL_PREFIX))" + $(KERNELRELEASE) "$(patsubst y,_,$(CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX))" # Create temporary dir for module support files # clean it up only when building all modules diff --git a/arch/Kconfig b/arch/Kconfig index 1455579791ec..7b433a4bcc28 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -384,6 +384,12 @@ config MODULES_USE_ELF_REL Modules only use ELF REL relocations. Modules with ELF RELA relocations will give an error. +config HAVE_UNDERSCORE_SYMBOL_PREFIX + bool + help + Some architectures generate an _ in front of C symbols; things like + module loading and assembly files need to know about this. 
+ # # ABI hall of shame # diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index c3f2e0bc644a..453ebe46b065 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -1,7 +1,3 @@ -config SYMBOL_PREFIX - string - default "_" - config MMU def_bool n @@ -33,6 +29,7 @@ config BLACKFIN select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_UID16 + select HAVE_UNDERSCORE_SYMBOL_PREFIX select VIRT_TO_BUS select ARCH_WANT_IPC_PARSE_VERSION select HAVE_GENERIC_HARDIRQS diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 79250de1b12a..303e4f9a79d1 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -12,10 +12,7 @@ config H8300 select MODULES_USE_ELF_RELA select OLD_SIGSUSPEND3 select OLD_SIGACTION - -config SYMBOL_PREFIX - string - default "_" + select HAVE_UNDERSCORE_SYMBOL_PREFIX config MMU bool diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index afc8973d1488..2099617e3ec0 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -1,7 +1,3 @@ -config SYMBOL_PREFIX - string - default "_" - config METAG def_bool y select EMBEDDED @@ -27,6 +23,7 @@ config METAG select HAVE_MOD_ARCH_SPECIFIC select HAVE_PERF_EVENTS select HAVE_SYSCALL_TRACEPOINTS + select HAVE_UNDERSCORE_SYMBOL_PREFIX select IRQ_DOMAIN select MODULES_USE_ELF_RELA select OF diff --git a/drivers/mtd/chips/gen_probe.c b/drivers/mtd/chips/gen_probe.c index 3b9a2843c5f8..74dbb6bcf488 100644 --- a/drivers/mtd/chips/gen_probe.c +++ b/drivers/mtd/chips/gen_probe.c @@ -204,14 +204,16 @@ static inline struct mtd_info *cfi_cmdset_unknown(struct map_info *map, struct cfi_private *cfi = map->fldrv_priv; __u16 type = primary?cfi->cfiq->P_ID:cfi->cfiq->A_ID; #ifdef CONFIG_MODULES - char probename[16+sizeof(MODULE_SYMBOL_PREFIX)]; + char probename[sizeof(VMLINUX_SYMBOL_STR(cfi_cmdset_%4.4X))]; cfi_cmdset_fn_t *probe_function; - sprintf(probename, MODULE_SYMBOL_PREFIX "cfi_cmdset_%4.4X", type); + sprintf(probename, VMLINUX_SYMBOL_STR(cfi_cmdset_%4.4X), type); probe_function = __symbol_get(probename); if (!probe_function) { - request_module(probename + sizeof(MODULE_SYMBOL_PREFIX) - 1); + char modname[sizeof("cfi_cmdset_%4.4X")]; + sprintf(modname, "cfi_cmdset_%4.4X", type); + request_module(modname); probe_function = __symbol_get(probename); } diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 4077b5d9ff81..15c0598e1109 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -1,4 +1,5 @@ #include +#include /* * These are required system calls, we should @@ -17,12 +18,7 @@ * but it doesn't work on all toolchains, so we just do it by hand */ #ifndef cond_syscall -#ifdef CONFIG_SYMBOL_PREFIX -#define __SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX -#else -#define __SYMBOL_PREFIX -#endif -#define cond_syscall(x) asm(".weak\t" __SYMBOL_PREFIX #x "\n\t" \ - ".set\t" __SYMBOL_PREFIX #x "," \ - __SYMBOL_PREFIX "sys_ni_syscall") +#define cond_syscall(x) asm(".weak\t" VMLINUX_SYMBOL_STR(x) "\n\t" \ + ".set\t" VMLINUX_SYMBOL_STR(x) "," \ + VMLINUX_SYMBOL_STR(sys_ni_syscall)) #endif diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index afa12c7a025c..eb58d2d7d971 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -52,13 +52,7 @@ #define LOAD_OFFSET 0 #endif -#ifndef SYMBOL_PREFIX -#define VMLINUX_SYMBOL(sym) sym -#else -#define PASTE2(x,y) x##y -#define PASTE(x,y) PASTE2(x,y) -#define VMLINUX_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) -#endif +#include /* Align . 
to a 8 byte boundary equals to maximum function alignment. */ #define ALIGN_FUNCTION() . = ALIGN(8) diff --git a/include/linux/export.h b/include/linux/export.h index 696c0f48afc7..412cd509effe 100644 --- a/include/linux/export.h +++ b/include/linux/export.h @@ -5,17 +5,24 @@ * to reduce the amount of pointless cruft we feed to gcc when only * exporting a simple symbol or two. * - * If you feel the need to add #include to this file - * then you are doing something wrong and should go away silently. + * Try not to add #includes here. It slows compilation and makes kernel + * hackers place grumpy comments in header files. */ /* Some toolchains use a `_' prefix for all user symbols. */ -#ifdef CONFIG_SYMBOL_PREFIX -#define MODULE_SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX +#ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX +#define __VMLINUX_SYMBOL(x) _##x +#define __VMLINUX_SYMBOL_STR(x) "_" #x #else -#define MODULE_SYMBOL_PREFIX "" +#define __VMLINUX_SYMBOL(x) x +#define __VMLINUX_SYMBOL_STR(x) #x #endif +/* Indirect, so macros are expanded before pasting. */ +#define VMLINUX_SYMBOL(x) __VMLINUX_SYMBOL(x) +#define VMLINUX_SYMBOL_STR(x) __VMLINUX_SYMBOL_STR(x) + +#ifndef __ASSEMBLY__ struct kernel_symbol { unsigned long value; @@ -51,7 +58,7 @@ extern struct module __this_module; __CRC_SYMBOL(sym, sec) \ static const char __kstrtab_##sym[] \ __attribute__((section("__ksymtab_strings"), aligned(1))) \ - = MODULE_SYMBOL_PREFIX #sym; \ + = VMLINUX_SYMBOL_STR(sym); \ static const struct kernel_symbol __ksymtab_##sym \ __used \ __attribute__((section("___ksymtab" sec "+" #sym), unused)) \ @@ -85,5 +92,6 @@ extern struct module __this_module; #define EXPORT_UNUSED_SYMBOL_GPL(sym) #endif /* CONFIG_MODULES */ +#endif /* !__ASSEMBLY__ */ #endif /* _LINUX_EXPORT_H */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 80d36874689b..e13e992eae8a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -723,13 +723,6 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } /* Trap pasters of __FUNCTION__ at compile-time */ #define __FUNCTION__ (__func__) -/* This helps us to avoid #ifdef CONFIG_SYMBOL_PREFIX */ -#ifdef CONFIG_SYMBOL_PREFIX -#define SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX -#else -#define SYMBOL_PREFIX "" -#endif - /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/include/linux/module.h b/include/linux/module.h index ead1b5719a12..46f1ea01e6f6 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -190,7 +190,7 @@ extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); -#define symbol_get(x) ((typeof(&x))(__symbol_get(MODULE_SYMBOL_PREFIX #x))) +#define symbol_get(x) ((typeof(&x))(__symbol_get(VMLINUX_SYMBOL_STR(x)))) /* modules using other modules: kdb wants to see this. 
*/ struct module_use { @@ -453,7 +453,7 @@ extern void __module_put_and_exit(struct module *mod, long code) #ifdef CONFIG_MODULE_UNLOAD unsigned long module_refcount(struct module *mod); void __symbol_put(const char *symbol); -#define symbol_put(x) __symbol_put(MODULE_SYMBOL_PREFIX #x) +#define symbol_put(x) __symbol_put(VMLINUX_SYMBOL_STR(x)) void symbol_put_addr(void *addr); /* Sometimes we know we already have a refcount, and it's easier not diff --git a/kernel/modsign_certificate.S b/kernel/modsign_certificate.S index 246b4c6e6135..4a9a86d12c8b 100644 --- a/kernel/modsign_certificate.S +++ b/kernel/modsign_certificate.S @@ -1,15 +1,8 @@ -/* SYMBOL_PREFIX defined on commandline from CONFIG_SYMBOL_PREFIX */ -#ifndef SYMBOL_PREFIX -#define ASM_SYMBOL(sym) sym -#else -#define PASTE2(x,y) x##y -#define PASTE(x,y) PASTE2(x,y) -#define ASM_SYMBOL(sym) PASTE(SYMBOL_PREFIX, sym) -#endif +#include #define GLOBAL(name) \ - .globl ASM_SYMBOL(name); \ - ASM_SYMBOL(name): + .globl VMLINUX_SYMBOL(name); \ + VMLINUX_SYMBOL(name): .section ".init.data","aw" diff --git a/kernel/module.c b/kernel/module.c index 0925c9a71975..cfd4a3f68d7d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1209,7 +1209,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, /* Since this should be found in kernel (which can't be removed), * no locking is necessary. */ - if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + if (!find_symbol(VMLINUX_SYMBOL_STR(module_layout), NULL, &crc, true, false)) BUG(); return check_version(sechdrs, versindex, "module_layout", mod, crc, diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 07125e697d7a..a373a1f66023 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -119,13 +119,6 @@ _c_flags += $(if $(patsubst n%,, \ $(CFLAGS_GCOV)) endif -ifdef CONFIG_SYMBOL_PREFIX -_sym_flags = -DSYMBOL_PREFIX=$(patsubst "%",%,$(CONFIG_SYMBOL_PREFIX)) -_cpp_flags += $(_sym_flags) -_a_flags += $(_sym_flags) -endif - - # If building the kernel in a separate objtree expand all occurrences # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/'). diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 3d569d6022c2..014994936b1c 100644 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -74,9 +74,8 @@ kallsyms() info KSYM ${2} local kallsymopt; - if [ -n "${CONFIG_SYMBOL_PREFIX}" ]; then - kallsymopt="${kallsymopt} \ - --symbol-prefix=${CONFIG_SYMBOL_PREFIX}" + if [ -n "${CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX}" ]; then + kallsymopt="${kallsymopt} --symbol-prefix=_" fi if [ -n "${CONFIG_KALLSYMS_ALL}" ]; then diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 78b30c1548e9..282decfa29ae 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -18,14 +18,7 @@ #include "modpost.h" #include "../../include/generated/autoconf.h" #include "../../include/linux/license.h" - -/* Some toolchains use a `_' prefix for all user symbols. */ -#ifdef CONFIG_SYMBOL_PREFIX -#define MODULE_SYMBOL_PREFIX CONFIG_SYMBOL_PREFIX -#else -#define MODULE_SYMBOL_PREFIX "" -#endif - +#include "../../include/linux/export.h" /* Are we using CONFIG_MODVERSIONS? 
*/ int modversions = 0; @@ -562,7 +555,7 @@ static void parse_elf_finish(struct elf_info *info) static int ignore_undef_symbol(struct elf_info *info, const char *symname) { /* ignore __this_module, it will be resolved shortly */ - if (strcmp(symname, MODULE_SYMBOL_PREFIX "__this_module") == 0) + if (strcmp(symname, VMLINUX_SYMBOL_STR(__this_module)) == 0) return 1; /* ignore global offset table */ if (strcmp(symname, "_GLOBAL_OFFSET_TABLE_") == 0) @@ -583,8 +576,8 @@ static int ignore_undef_symbol(struct elf_info *info, const char *symname) return 0; } -#define CRC_PFX MODULE_SYMBOL_PREFIX "__crc_" -#define KSYMTAB_PFX MODULE_SYMBOL_PREFIX "__ksymtab_" +#define CRC_PFX VMLINUX_SYMBOL_STR(__crc_) +#define KSYMTAB_PFX VMLINUX_SYMBOL_STR(__ksymtab_) static void handle_modversions(struct module *mod, struct elf_info *info, Elf_Sym *sym, const char *symname) @@ -637,14 +630,15 @@ static void handle_modversions(struct module *mod, struct elf_info *info, } #endif - if (memcmp(symname, MODULE_SYMBOL_PREFIX, - strlen(MODULE_SYMBOL_PREFIX)) == 0) { - mod->unres = - alloc_symbol(symname + - strlen(MODULE_SYMBOL_PREFIX), - ELF_ST_BIND(sym->st_info) == STB_WEAK, - mod->unres); - } +#ifdef CONFIG_HAVE_UNDERSCORE_SYMBOL_PREFIX + if (symname[0] != '_') + break; + else + symname++; +#endif + mod->unres = alloc_symbol(symname, + ELF_ST_BIND(sym->st_info) == STB_WEAK, + mod->unres); break; default: /* All exported symbols */ @@ -652,9 +646,9 @@ static void handle_modversions(struct module *mod, struct elf_info *info, sym_add_exported(symname + strlen(KSYMTAB_PFX), mod, export); } - if (strcmp(symname, MODULE_SYMBOL_PREFIX "init_module") == 0) + if (strcmp(symname, VMLINUX_SYMBOL_STR(init_module)) == 0) mod->has_init = 1; - if (strcmp(symname, MODULE_SYMBOL_PREFIX "cleanup_module") == 0) + if (strcmp(symname, VMLINUX_SYMBOL_STR(cleanup_module)) == 0) mod->has_cleanup = 1; break; } -- cgit From 8a7fbfab4be39b8690543f3d29b26860d2f6c576 Mon Sep 17 00:00:00 2001 From: "nikolay@redhat.com" Date: Tue, 12 Mar 2013 02:49:01 +0000 Subject: netxen: write IP address to firmware when using bonding This patch allows LRO aggregation on bonded devices that contain an NX3031 device. It also adds a for_each_netdev_in_bond_rcu(bond, slave) macro which executes for each slave that has bond as master. V3: After testing and discussing this with Rajesh, I decided to keep the vlan ip cache and just rename it to ip_cache since it will store bond ip addresses too. A new master flag has been added to the ip cache to denote that the address has been added because of a master device. I've taken care of the enslave/release cases by checking for various combinations of events and flags (e.g. netxen has a master, it's a bond master and it's not marked as a slave means it is being enslaved and is dev_open()ed in bond_enslave). I've changed netxen_free_ip_list() to have a "master" parameter which causes all IP addresses marked as master to be deleted (used when a netxen is being released). I've made the patch use the new upper device API as well. The following cases were tested: - bond -> netxen - vlan -> netxen - vlan -> bond -> netxen V2: Remove local ip caching, retrieve addresses dynamically and restore them if necessary. Note: Tested with NX3031 adapter. Tested-by: Rajesh Borundia Signed-off-by: Andy Gospodarek Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- drivers/net/ethernet/qlogic/netxen/netxen_nic.h | 5 +- .../net/ethernet/qlogic/netxen/netxen_nic_main.c | 220 ++++++++++++++------- include/linux/netdevice.h | 8 + 3 files changed, 155 insertions(+), 78 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic.h b/drivers/net/ethernet/qlogic/netxen/netxen_nic.h index eb3dfdbb642b..322a36b76727 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic.h +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic.h @@ -955,9 +955,10 @@ typedef struct nx_mac_list_s { uint8_t mac_addr[ETH_ALEN+2]; } nx_mac_list_t; -struct nx_vlan_ip_list { +struct nx_ip_list { struct list_head list; __be32 ip_addr; + bool master; }; /* @@ -1605,7 +1606,7 @@ struct netxen_adapter { struct net_device *netdev; struct pci_dev *pdev; struct list_head mac_list; - struct list_head vlan_ip_list; + struct list_head ip_list; spinlock_t tx_clean_lock; diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c index 501f49207da5..7867aebc05f2 100644 --- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c +++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c @@ -90,7 +90,7 @@ static irqreturn_t netxen_intr(int irq, void *data); static irqreturn_t netxen_msi_intr(int irq, void *data); static irqreturn_t netxen_msix_intr(int irq, void *data); -static void netxen_free_vlan_ip_list(struct netxen_adapter *); +static void netxen_free_ip_list(struct netxen_adapter *, bool); static void netxen_restore_indev_addr(struct net_device *dev, unsigned long); static struct rtnl_link_stats64 *netxen_nic_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats); @@ -1450,7 +1450,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) spin_lock_init(&adapter->tx_clean_lock); INIT_LIST_HEAD(&adapter->mac_list); - INIT_LIST_HEAD(&adapter->vlan_ip_list); + INIT_LIST_HEAD(&adapter->ip_list); err = netxen_setup_pci_map(adapter); if (err) @@ -1585,7 +1585,7 @@ static void netxen_nic_remove(struct pci_dev *pdev) cancel_work_sync(&adapter->tx_timeout_task); - netxen_free_vlan_ip_list(adapter); + netxen_free_ip_list(adapter, false); netxen_nic_detach(adapter); nx_decr_dev_ref_cnt(adapter); @@ -3137,62 +3137,77 @@ netxen_destip_supported(struct netxen_adapter *adapter) } static void -netxen_free_vlan_ip_list(struct netxen_adapter *adapter) +netxen_free_ip_list(struct netxen_adapter *adapter, bool master) { - struct nx_vlan_ip_list *cur; - struct list_head *head = &adapter->vlan_ip_list; + struct nx_ip_list *cur, *tmp_cur; - while (!list_empty(head)) { - cur = list_entry(head->next, struct nx_vlan_ip_list, list); - netxen_config_ipaddr(adapter, cur->ip_addr, NX_IP_DOWN); - list_del(&cur->list); - kfree(cur); + list_for_each_entry_safe(cur, tmp_cur, &adapter->ip_list, list) { + if (master) { + if (cur->master) { + netxen_config_ipaddr(adapter, cur->ip_addr, + NX_IP_DOWN); + list_del(&cur->list); + kfree(cur); + } + } else { + netxen_config_ipaddr(adapter, cur->ip_addr, NX_IP_DOWN); + list_del(&cur->list); + kfree(cur); + } } - } -static void -netxen_list_config_vlan_ip(struct netxen_adapter *adapter, + +static bool +netxen_list_config_ip(struct netxen_adapter *adapter, struct in_ifaddr *ifa, unsigned long event) { struct net_device *dev; - struct nx_vlan_ip_list *cur, *tmp_cur; + struct nx_ip_list *cur, *tmp_cur; struct list_head *head; + bool ret = false; dev = ifa->ifa_dev ? 
ifa->ifa_dev->dev : NULL; if (dev == NULL) - return; - - if (!is_vlan_dev(dev)) - return; + goto out; switch (event) { case NX_IP_UP: - list_for_each(head, &adapter->vlan_ip_list) { - cur = list_entry(head, struct nx_vlan_ip_list, list); + list_for_each(head, &adapter->ip_list) { + cur = list_entry(head, struct nx_ip_list, list); if (cur->ip_addr == ifa->ifa_address) - return; + goto out; } - cur = kzalloc(sizeof(struct nx_vlan_ip_list), GFP_ATOMIC); + cur = kzalloc(sizeof(struct nx_ip_list), GFP_ATOMIC); if (cur == NULL) - return; - + goto out; + if (dev->priv_flags & IFF_802_1Q_VLAN) + dev = vlan_dev_real_dev(dev); + cur->master = !!netif_is_bond_master(dev); cur->ip_addr = ifa->ifa_address; - list_add_tail(&cur->list, &adapter->vlan_ip_list); + list_add_tail(&cur->list, &adapter->ip_list); + netxen_config_ipaddr(adapter, ifa->ifa_address, NX_IP_UP); + ret = true; break; case NX_IP_DOWN: list_for_each_entry_safe(cur, tmp_cur, - &adapter->vlan_ip_list, list) { + &adapter->ip_list, list) { if (cur->ip_addr == ifa->ifa_address) { list_del(&cur->list); kfree(cur); + netxen_config_ipaddr(adapter, ifa->ifa_address, + NX_IP_DOWN); + ret = true; break; } } } +out: + return ret; } + static void netxen_config_indev_addr(struct netxen_adapter *adapter, struct net_device *dev, unsigned long event) @@ -3209,14 +3224,10 @@ netxen_config_indev_addr(struct netxen_adapter *adapter, for_ifa(indev) { switch (event) { case NETDEV_UP: - netxen_config_ipaddr(adapter, - ifa->ifa_address, NX_IP_UP); - netxen_list_config_vlan_ip(adapter, ifa, NX_IP_UP); + netxen_list_config_ip(adapter, ifa, NX_IP_UP); break; case NETDEV_DOWN: - netxen_config_ipaddr(adapter, - ifa->ifa_address, NX_IP_DOWN); - netxen_list_config_vlan_ip(adapter, ifa, NX_IP_DOWN); + netxen_list_config_ip(adapter, ifa, NX_IP_DOWN); break; default: break; @@ -3231,23 +3242,78 @@ netxen_restore_indev_addr(struct net_device *netdev, unsigned long event) { struct netxen_adapter *adapter = netdev_priv(netdev); - struct nx_vlan_ip_list *pos, *tmp_pos; + struct nx_ip_list *pos, *tmp_pos; unsigned long ip_event; ip_event = (event == NETDEV_UP) ? NX_IP_UP : NX_IP_DOWN; netxen_config_indev_addr(adapter, netdev, event); - list_for_each_entry_safe(pos, tmp_pos, &adapter->vlan_ip_list, list) { + list_for_each_entry_safe(pos, tmp_pos, &adapter->ip_list, list) { netxen_config_ipaddr(adapter, pos->ip_addr, ip_event); } } +static inline bool +netxen_config_checkdev(struct net_device *dev) +{ + struct netxen_adapter *adapter; + + if (!is_netxen_netdev(dev)) + return false; + adapter = netdev_priv(dev); + if (!adapter) + return false; + if (!netxen_destip_supported(adapter)) + return false; + if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) + return false; + + return true; +} + +/** + * netxen_config_master - configure addresses based on master + * @dev: netxen device + * @event: netdev event + */ +static void netxen_config_master(struct net_device *dev, unsigned long event) +{ + struct net_device *master, *slave; + struct netxen_adapter *adapter = netdev_priv(dev); + + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + /* + * This is the case where the netxen nic is being + * enslaved and is dev_open()ed in bond_enslave() + * Now we should program the bond's (and its vlans') + * addresses in the netxen NIC. 
+ */ + if (master && netif_is_bond_master(master) && + !netif_is_bond_slave(dev)) { + netxen_config_indev_addr(adapter, master, event); + for_each_netdev_rcu(&init_net, slave) + if (slave->priv_flags & IFF_802_1Q_VLAN && + vlan_dev_real_dev(slave) == master) + netxen_config_indev_addr(adapter, slave, event); + } + rcu_read_unlock(); + /* + * This is the case where the netxen nic is being + * released and is dev_close()ed in bond_release() + * just before IFF_BONDING is stripped. + */ + if (!master && dev->priv_flags & IFF_BONDING) + netxen_free_ip_list(adapter, true); +} + static int netxen_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netxen_adapter *adapter; struct net_device *dev = (struct net_device *)ptr; struct net_device *orig_dev = dev; + struct net_device *slave; recheck: if (dev == NULL) @@ -3257,19 +3323,28 @@ recheck: dev = vlan_dev_real_dev(dev); goto recheck; } - - if (!is_netxen_netdev(dev)) - goto done; - - adapter = netdev_priv(dev); - - if (!adapter) - goto done; - - if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) - goto done; - - netxen_config_indev_addr(adapter, orig_dev, event); + if (event == NETDEV_UP || event == NETDEV_DOWN) { + /* If this is a bonding device, look for netxen-based slaves*/ + if (netif_is_bond_master(dev)) { + rcu_read_lock(); + for_each_netdev_in_bond_rcu(dev, slave) { + if (!netxen_config_checkdev(slave)) + continue; + adapter = netdev_priv(slave); + netxen_config_indev_addr(adapter, + orig_dev, event); + } + rcu_read_unlock(); + } else { + if (!netxen_config_checkdev(dev)) + goto done; + adapter = netdev_priv(dev); + /* Act only if the actual netxen is the target */ + if (orig_dev == dev) + netxen_config_master(dev, event); + netxen_config_indev_addr(adapter, orig_dev, event); + } + } done: return NOTIFY_DONE; } @@ -3279,12 +3354,12 @@ netxen_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) { struct netxen_adapter *adapter; - struct net_device *dev; - + struct net_device *dev, *slave; struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + unsigned long ip_event; dev = ifa->ifa_dev ? ifa->ifa_dev->dev : NULL; - + ip_event = (event == NETDEV_UP) ? 
NX_IP_UP : NX_IP_DOWN; recheck: if (dev == NULL) goto done; @@ -3293,31 +3368,24 @@ recheck: dev = vlan_dev_real_dev(dev); goto recheck; } - - if (!is_netxen_netdev(dev)) - goto done; - - adapter = netdev_priv(dev); - - if (!adapter || !netxen_destip_supported(adapter)) - goto done; - - if (adapter->is_up != NETXEN_ADAPTER_UP_MAGIC) - goto done; - - switch (event) { - case NETDEV_UP: - netxen_config_ipaddr(adapter, ifa->ifa_address, NX_IP_UP); - netxen_list_config_vlan_ip(adapter, ifa, NX_IP_UP); - break; - case NETDEV_DOWN: - netxen_config_ipaddr(adapter, ifa->ifa_address, NX_IP_DOWN); - netxen_list_config_vlan_ip(adapter, ifa, NX_IP_DOWN); - break; - default: - break; + if (event == NETDEV_UP || event == NETDEV_DOWN) { + /* If this is a bonding device, look for netxen-based slaves*/ + if (netif_is_bond_master(dev)) { + rcu_read_lock(); + for_each_netdev_in_bond_rcu(dev, slave) { + if (!netxen_config_checkdev(slave)) + continue; + adapter = netdev_priv(slave); + netxen_list_config_ip(adapter, ifa, ip_event); + } + rcu_read_unlock(); + } else { + if (!netxen_config_checkdev(dev)) + goto done; + adapter = netdev_priv(dev); + netxen_list_config_ip(adapter, ifa, ip_event); + } } - done: return NOTIFY_DONE; } @@ -3334,7 +3402,7 @@ static void netxen_restore_indev_addr(struct net_device *dev, unsigned long event) { } static void -netxen_free_vlan_ip_list(struct netxen_adapter *adapter) +netxen_free_ip_list(struct netxen_adapter *adapter, bool master) { } #endif diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e1ebeffa6b35..9fc1ab0c8914 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1617,6 +1617,9 @@ extern seqcount_t devnet_rename_seq; /* Device rename seq */ list_for_each_entry_continue(d, &(net)->dev_base_head, dev_list) #define for_each_netdev_continue_rcu(net, d) \ list_for_each_entry_continue_rcu(d, &(net)->dev_base_head, dev_list) +#define for_each_netdev_in_bond_rcu(bond, slave) \ + for_each_netdev_rcu(&init_net, slave) \ + if (netdev_master_upper_dev_get_rcu(slave) == bond) #define net_device_entry(lh) list_entry(lh, struct net_device, dev_list) static inline struct net_device *next_net_device(struct net_device *dev) @@ -2774,6 +2777,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev, dev->gso_max_size = size; } +static inline bool netif_is_bond_master(struct net_device *dev) +{ + return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING; +} + static inline bool netif_is_bond_slave(struct net_device *dev) { return dev->flags & IFF_SLAVE && dev->priv_flags & IFF_BONDING; -- cgit From 764444f5a324ad5a272773f078192819084388ce Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Wed, 13 Mar 2013 16:57:25 +0000 Subject: net: clean leftover of COMPAT_NET_DEV_OPS removal COMPAT_NET_DEV_OPS was removed a while back and with it the definition of netdev_resync_ops() went away. Let's finish the clean-up. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 9fc1ab0c8914..56e3e0665272 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1692,7 +1692,6 @@ extern int netdev_refcnt_read(const struct net_device *dev); extern void free_netdev(struct net_device *dev); extern void synchronize_net(void); extern int init_dummy_netdev(struct net_device *dev); -extern void netdev_resync_ops(struct net_device *dev); extern struct net_device *dev_get_by_index(struct net *net, int ifindex); extern struct net_device *__dev_get_by_index(struct net *net, int ifindex); -- cgit From dad3cab3e063110b3ae3dc82a00e7aacd09b91ec Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Mon, 4 Mar 2013 16:52:49 -0600 Subject: USB: fix trivial usb_device kernel-doc errors Fix trivial kernel-doc warnings: Warning(include/linux/usb.h:574): No description found for parameter 'usb3_lpm_enabled' Warning(include/linux/usb.h:574): Excess struct/union/enum/typedef member 'usb_classdev' description in 'usb_device' Warning(include/linux/usb.h:574): Excess struct/union/enum/typedef member 'usbfs_dentry' description in 'usb_device' Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Jiri Kosina Cc: linux-usb@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Nishanth Menon Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb.h b/include/linux/usb.h index 4d22d0f6167a..52464fb2389b 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -469,14 +469,12 @@ struct usb3_lpm_parameters { * @lpm_capable: device supports LPM * @usb2_hw_lpm_capable: device can perform USB2 hardware LPM * @usb2_hw_lpm_enabled: USB2 hardware LPM enabled + * @usb3_lpm_enabled: USB3 hardware LPM enabled * @string_langid: language ID for strings * @product: iProduct string, if present (static) * @manufacturer: iManufacturer string, if present (static) * @serial: iSerialNumber string, if present (static) * @filelist: usbfs files that are open to this device - * @usb_classdev: USB class device that was created for usbfs device - * access from userspace - * @usbfs_dentry: usbfs dentry entry for the device * @maxchild: number of ports if hub * @quirks: quirks of the whole device * @urbnum: number of URBs submitted for the whole device -- cgit From 96dd86fa588169b745a71aedf2070e80f4943623 Mon Sep 17 00:00:00 2001 From: "K. Y. Srinivasan" Date: Fri, 15 Mar 2013 12:30:06 -0700 Subject: Drivers: hv: Add a new driver to support host initiated backup This driver supports host initiated backup of the guest. On Windows guests, the host can generate application consistent backups using the Windows VSS framework. On Linux, we ensure that the backup will be file system consistent. This driver allows the host to initiate a "Freeze" operation on all the mounted file systems in the guest. Once the mounted file systems in the guest are frozen, the host snapshots the guest's file systems. Once this is done, the guest's file systems are "thawed". This driver has a user-level component (daemon) that invokes the appropriate operation on all the mounted file systems in response to the requests from the host. The duration for which the guest is frozen is very short - a few seconds. During this interval, the diff disk is committed. In this version of the patch I have addressed the feedback from Olaf Hering.
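
[Editorial note: a minimal sketch of what the "Freeze" operation described above amounts to for a single mounted file system. This is not part of the patch; the daemon added below shells out to fsfreeze(8), which is a thin wrapper around the same FIFREEZE/FITHAW ioctls. The freeze_fs() helper and its mount_point parameter are illustrative only.]

#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>	/* FIFREEZE, FITHAW */

/* Freeze (freeze != 0) or thaw (freeze == 0) one mounted file system. */
static int freeze_fs(const char *mount_point, int freeze)
{
	int fd = open(mount_point, O_RDONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, freeze ? FIFREEZE : FITHAW, 0);
	close(fd);
	return ret;
}

While a file system is frozen, writes to it block, which is why the commit message stresses that the freeze window is only a few seconds.
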
Also, some of the connector related issues have been fixed. Signed-off-by: K. Y. Srinivasan Reviewed-by: Haiyang Zhang Cc: Evgeniy Polyakov Signed-off-by: Greg Kroah-Hartman --- drivers/hv/Makefile | 2 +- drivers/hv/hv_snapshot.c | 287 +++++++++++++++++++++++++++++++++++++++++ drivers/hv/hv_util.c | 10 ++ include/linux/hyperv.h | 69 ++++++++++ include/uapi/linux/connector.h | 5 +- tools/hv/hv_vss_daemon.c | 220 +++++++++++++++++++++++++++++++ 6 files changed, 591 insertions(+), 2 deletions(-) create mode 100644 drivers/hv/hv_snapshot.c create mode 100644 tools/hv/hv_vss_daemon.c (limited to 'include/linux') diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index e6abfa02d8b7..0a74b5661186 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o -hv_utils-y := hv_util.o hv_kvp.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c new file mode 100644 index 000000000000..8ad5653ce447 --- /dev/null +++ b/drivers/hv/hv_snapshot.c @@ -0,0 +1,287 @@ +/* + * An implementation of host initiated guest snapshot. + * + * + * Copyright (C) 2013, Microsoft, Inc. + * Author : K. Y. Srinivasan + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/nls.h> +#include <linux/connector.h> +#include <linux/workqueue.h> +#include <linux/hyperv.h> + + + +/* + * Global state maintained for transaction that is being processed. + * Note that only one transaction can be active at any point in time. + * + * This state is set when we receive a request from the host; we + * cleanup this state when the transaction is completed - when we respond + * to the host with the key value. + */ + +static struct { + bool active; /* transaction status - active or not */ + int recv_len; /* number of bytes received. */ + struct vmbus_channel *recv_channel; /* chn we got the request */ + u64 recv_req_id; /* request ID. */ + struct hv_vss_msg *msg; /* current message */ +} vss_transaction; + + +static void vss_respond_to_host(int error); + +static struct cb_id vss_id = { CN_VSS_IDX, CN_VSS_VAL }; +static const char vss_name[] = "vss_kernel_module"; +static __u8 *recv_buffer; + +static void vss_send_op(struct work_struct *dummy); +static DECLARE_WORK(vss_send_op_work, vss_send_op); + +/* + * Callback when data is received from user mode.
+ */ + +static void +vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + struct hv_vss_msg *vss_msg; + + vss_msg = (struct hv_vss_msg *)msg->data; + + if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) { + pr_info("VSS daemon registered\n"); + vss_transaction.active = false; + if (vss_transaction.recv_channel != NULL) + hv_vss_onchannelcallback(vss_transaction.recv_channel); + return; + + } + vss_respond_to_host(vss_msg->error); +} + + +static void vss_send_op(struct work_struct *dummy) +{ + int op = vss_transaction.msg->vss_hdr.operation; + struct cn_msg *msg; + struct hv_vss_msg *vss_msg; + + msg = kzalloc(sizeof(*msg) + sizeof(*vss_msg), GFP_ATOMIC); + if (!msg) + return; + + vss_msg = (struct hv_vss_msg *)msg->data; + + msg->id.idx = CN_VSS_IDX; + msg->id.val = CN_VSS_VAL; + + vss_msg->vss_hdr.operation = op; + msg->len = sizeof(struct hv_vss_msg); + + cn_netlink_send(msg, 0, GFP_ATOMIC); + kfree(msg); + + return; +} + +/* + * Send a response back to the host. + */ + +static void +vss_respond_to_host(int error) +{ + struct icmsg_hdr *icmsghdrp; + u32 buf_len; + struct vmbus_channel *channel; + u64 req_id; + + /* + * If a transaction is not active; log and return. + */ + + if (!vss_transaction.active) { + /* + * This is a spurious call! + */ + pr_warn("VSS: Transaction not active\n"); + return; + } + /* + * Copy the global state for completing the transaction. Note that + * only one transaction can be active at a time. + */ + + buf_len = vss_transaction.recv_len; + channel = vss_transaction.recv_channel; + req_id = vss_transaction.recv_req_id; + vss_transaction.active = false; + + icmsghdrp = (struct icmsg_hdr *) + &recv_buffer[sizeof(struct vmbuspipe_hdr)]; + + if (channel->onchannel_callback == NULL) + /* + * We have raced with util driver being unloaded; + * silently return. + */ + return; + + icmsghdrp->status = error; + + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; + + vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, + VM_PKT_DATA_INBAND, 0); + +} + +/* + * This callback is invoked when we get a VSS message from the host. + * The host ensures that only one VSS transaction can be active at a time. + */ + +void hv_vss_onchannelcallback(void *context) +{ + struct vmbus_channel *channel = context; + u32 recvlen; + u64 requestid; + struct hv_vss_msg *vss_msg; + + + struct icmsg_hdr *icmsghdrp; + struct icmsg_negotiate *negop = NULL; + + if (vss_transaction.active) { + /* + * We will defer processing this callback until + * the current transaction is complete. + */ + vss_transaction.recv_channel = channel; + return; + } + + vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, + &requestid); + + if (recvlen > 0) { + icmsghdrp = (struct icmsg_hdr *)&recv_buffer[ + sizeof(struct vmbuspipe_hdr)]; + + if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { + vmbus_prep_negotiate_resp(icmsghdrp, negop, + recv_buffer, MAX_SRV_VER, MAX_SRV_VER); + /* + * We currently negotiate the highest number the + * host has presented. If this version is not + * at least 5.0, reject. + */ + negop = (struct icmsg_negotiate *)&recv_buffer[ + sizeof(struct vmbuspipe_hdr) + + sizeof(struct icmsg_hdr)]; + + if (negop->icversion_data[1].major < 5) + negop->icframe_vercnt = 0; + } else { + vss_msg = (struct hv_vss_msg *)&recv_buffer[ + sizeof(struct vmbuspipe_hdr) + + sizeof(struct icmsg_hdr)]; + + /* + * Stash away this global state for completing the + * transaction; note transactions are serialized.
+ */ + + vss_transaction.recv_len = recvlen; + vss_transaction.recv_channel = channel; + vss_transaction.recv_req_id = requestid; + vss_transaction.active = true; + vss_transaction.msg = (struct hv_vss_msg *)vss_msg; + + switch (vss_msg->vss_hdr.operation) { + /* + * Initiate a "freeze/thaw" + * operation in the guest. + * We respond to the host once + * the operation is complete. + * + * We send the message to the + * user space daemon and the + * operation is performed in + * the daemon. + */ + case VSS_OP_FREEZE: + case VSS_OP_THAW: + schedule_work(&vss_send_op_work); + return; + + case VSS_OP_HOT_BACKUP: + vss_msg->vss_cf.flags = + VSS_HBU_NO_AUTO_RECOVERY; + vss_respond_to_host(0); + return; + + case VSS_OP_GET_DM_INFO: + vss_msg->dm_info.flags = 0; + vss_respond_to_host(0); + return; + + default: + vss_respond_to_host(0); + return; + + } + + } + + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION + | ICMSGHDRFLAG_RESPONSE; + + vmbus_sendpacket(channel, recv_buffer, + recvlen, requestid, + VM_PKT_DATA_INBAND, 0); + } + +} + +int +hv_vss_init(struct hv_util_service *srv) +{ + int err; + + err = cn_add_callback(&vss_id, vss_name, vss_cn_callback); + if (err) + return err; + recv_buffer = srv->recv_buffer; + + /* + * When this driver loads, the user level daemon that + * processes the host requests may not yet be running. + * Defer processing channel callbacks until the daemon + * has registered. + */ + vss_transaction.active = true; + return 0; +} + +void hv_vss_deinit(void) +{ + cn_del_callback(&vss_id); + cancel_work_sync(&vss_send_op_work); +} diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 1d4cbd8e8261..2f561c5dfe24 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -49,6 +49,12 @@ static struct hv_util_service util_kvp = { .util_deinit = hv_kvp_deinit, }; +static struct hv_util_service util_vss = { + .util_cb = hv_vss_onchannelcallback, + .util_init = hv_vss_init, + .util_deinit = hv_vss_deinit, +}; + static void perform_shutdown(struct work_struct *dummy) { orderly_poweroff(true); @@ -339,6 +345,10 @@ static const struct hv_vmbus_device_id id_table[] = { { HV_KVP_GUID, .driver_data = (unsigned long)&util_kvp }, + /* VSS GUID */ + { HV_VSS_GUID, + .driver_data = (unsigned long)&util_vss + }, { }, }; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index df77ba9a8166..95d0850584da 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -27,6 +27,63 @@ #include + +/* + * Implementation of host controlled snapshot of the guest. + */ + +#define VSS_OP_REGISTER 128 + +enum hv_vss_op { + VSS_OP_CREATE = 0, + VSS_OP_DELETE, + VSS_OP_HOT_BACKUP, + VSS_OP_GET_DM_INFO, + VSS_OP_BU_COMPLETE, + /* + * Following operations are only supported with IC version >= 5.0 + */ + VSS_OP_FREEZE, /* Freeze the file systems in the VM */ + VSS_OP_THAW, /* Unfreeze the file systems */ + VSS_OP_AUTO_RECOVER, + VSS_OP_COUNT /* Number of operations, must be last */ +}; + + +/* + * Header for all VSS messages. + */ +struct hv_vss_hdr { + __u8 operation; + __u8 reserved[7]; +} __attribute__((packed)); + + +/* + * Flag values for the hv_vss_check_feature. Linux supports only + * one value. 
+ */ +#define VSS_HBU_NO_AUTO_RECOVERY 0x00000005 + +struct hv_vss_check_feature { + __u32 flags; +} __attribute__((packed)); + +struct hv_vss_check_dm_info { + __u32 flags; +} __attribute__((packed)); + +struct hv_vss_msg { + union { + struct hv_vss_hdr vss_hdr; + int error; + }; + union { + struct hv_vss_check_feature vss_cf; + struct hv_vss_check_dm_info dm_info; + }; +} __attribute__((packed)); + /* * An implementation of HyperV key value pair (KVP) functionality for Linux. * @@ -1252,6 +1309,14 @@ void vmbus_driver_unregister(struct hv_driver *hv_driver); 0xb9, 0x8b, 0x8b, 0xa1, 0xa1, 0xf3, 0xf9, 0x5a \ } +/* + * VSS (Backup/Restore) GUID + */ +#define HV_VSS_GUID \ + .guid = { \ + 0x29, 0x2e, 0xfa, 0x35, 0x23, 0xea, 0x36, 0x42, \ + 0x96, 0xae, 0x3a, 0x6e, 0xba, 0xcb, 0xa4, 0x40 \ + } /* * Common header for Hyper-V ICs */ @@ -1356,6 +1421,10 @@ int hv_kvp_init(struct hv_util_service *); void hv_kvp_deinit(void); void hv_kvp_onchannelcallback(void *); +int hv_vss_init(struct hv_util_service *); +void hv_vss_deinit(void); +void hv_vss_onchannelcallback(void *); + /* * Negotiated version with the Host. */ diff --git a/include/uapi/linux/connector.h b/include/uapi/linux/connector.h index 8761a0349c74..4cb283505e45 100644 --- a/include/uapi/linux/connector.h +++ b/include/uapi/linux/connector.h @@ -44,8 +44,11 @@ #define CN_VAL_DRBD 0x1 #define CN_KVP_IDX 0x9 /* HyperV KVP */ #define CN_KVP_VAL 0x1 /* queries from the kernel */ +#define CN_VSS_IDX 0xA /* HyperV VSS */ +#define CN_VSS_VAL 0x1 /* queries from the kernel */ -#define CN_NETLINK_USERS 10 /* Highest index + 1 */ + +#define CN_NETLINK_USERS 11 /* Highest index + 1 */ /* * Maximum connector's message size. diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c new file mode 100644 index 000000000000..95269952aa92 --- /dev/null +++ b/tools/hv/hv_vss_daemon.c @@ -0,0 +1,220 @@ +/* + * An implementation of the host initiated guest snapshot for Hyper-V. + * + * + * Copyright (C) 2013, Microsoft, Inc. + * Author : K. Y. Srinivasan + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. 
+ * + */ + + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/poll.h> +#include <linux/types.h> +#include <linux/kdev_t.h> +#include <linux/connector.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> +#include <arpa/inet.h> +#include <linux/hyperv.h> +#include <linux/netlink.h> +#include <syslog.h> + +static char vss_recv_buffer[4096]; +static char vss_send_buffer[4096]; +static struct sockaddr_nl addr; + +#ifndef SOL_NETLINK +#define SOL_NETLINK 270 +#endif + + +static int vss_operate(int operation) +{ + char *fs_op; + char cmd[512]; + char buf[512]; + FILE *file; + char *p; + char *x; + int error; + + switch (operation) { + case VSS_OP_FREEZE: + fs_op = "-f "; + break; + case VSS_OP_THAW: + fs_op = "-u "; + break; + } + + file = popen("mount | awk '/^\\/dev\\// { print $3}'", "r"); + if (file == NULL) + return -1; + + while ((p = fgets(buf, sizeof(buf), file)) != NULL) { + x = strchr(p, '\n'); + *x = '\0'; + if (!strncmp(p, "/", sizeof("/"))) + continue; + + sprintf(cmd, "%s %s %s", "fsfreeze ", fs_op, p); + syslog(LOG_INFO, "VSS cmd is %s\n", cmd); + error = system(cmd); + } + pclose(file); + + sprintf(cmd, "%s %s %s", "fsfreeze ", fs_op, "/"); + syslog(LOG_INFO, "VSS cmd is %s\n", cmd); + error = system(cmd); + + return error; +} + +static int netlink_send(int fd, struct cn_msg *msg) +{ + struct nlmsghdr *nlh; + unsigned int size; + struct msghdr message; + char buffer[64]; + struct iovec iov[2]; + + size = NLMSG_SPACE(sizeof(struct cn_msg) + msg->len); + + nlh = (struct nlmsghdr *)buffer; + nlh->nlmsg_seq = 0; + nlh->nlmsg_pid = getpid(); + nlh->nlmsg_type = NLMSG_DONE; + nlh->nlmsg_len = NLMSG_LENGTH(size - sizeof(*nlh)); + nlh->nlmsg_flags = 0; + + iov[0].iov_base = nlh; + iov[0].iov_len = sizeof(*nlh); + + iov[1].iov_base = msg; + iov[1].iov_len = size; + + memset(&message, 0, sizeof(message)); + message.msg_name = &addr; + message.msg_namelen = sizeof(addr); + message.msg_iov = iov; + message.msg_iovlen = 2; + + return sendmsg(fd, &message, 0); +} + +int main(void) +{ + int fd, len, nl_group; + int error; + struct cn_msg *message; + struct pollfd pfd; + struct nlmsghdr *incoming_msg; + struct cn_msg *incoming_cn_msg; + int op; + struct hv_vss_msg *vss_msg; + + daemon(1, 0); + openlog("Hyper-V VSS", 0, LOG_USER); + syslog(LOG_INFO, "VSS starting; pid is:%d", getpid()); + + fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR); + if (fd < 0) { + syslog(LOG_ERR, "netlink socket creation failed; error:%d", fd); + exit(EXIT_FAILURE); + } + addr.nl_family = AF_NETLINK; + addr.nl_pad = 0; + addr.nl_pid = 0; + addr.nl_groups = 0; + + + error = bind(fd, (struct sockaddr *)&addr, sizeof(addr)); + if (error < 0) { + syslog(LOG_ERR, "bind failed; error:%d", error); + close(fd); + exit(EXIT_FAILURE); + } + nl_group = CN_VSS_IDX; + setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &nl_group, sizeof(nl_group)); + /* + * Register ourselves with the kernel.
+ */ + message = (struct cn_msg *)vss_send_buffer; + message->id.idx = CN_VSS_IDX; + message->id.val = CN_VSS_VAL; + message->ack = 0; + vss_msg = (struct hv_vss_msg *)message->data; + vss_msg->vss_hdr.operation = VSS_OP_REGISTER; + + message->len = sizeof(struct hv_vss_msg); + + len = netlink_send(fd, message); + if (len < 0) { + syslog(LOG_ERR, "netlink_send failed; error:%d", len); + close(fd); + exit(EXIT_FAILURE); + } + + pfd.fd = fd; + + while (1) { + struct sockaddr *addr_p = (struct sockaddr *) &addr; + socklen_t addr_l = sizeof(addr); + pfd.events = POLLIN; + pfd.revents = 0; + poll(&pfd, 1, -1); + + len = recvfrom(fd, vss_recv_buffer, sizeof(vss_recv_buffer), 0, + addr_p, &addr_l); + + if (len < 0 || addr.nl_pid) { + syslog(LOG_ERR, "recvfrom failed; pid:%u error:%d %s", + addr.nl_pid, errno, strerror(errno)); + close(fd); + return -1; + } + + incoming_msg = (struct nlmsghdr *)vss_recv_buffer; + + if (incoming_msg->nlmsg_type != NLMSG_DONE) + continue; + + incoming_cn_msg = (struct cn_msg *)NLMSG_DATA(incoming_msg); + vss_msg = (struct hv_vss_msg *)incoming_cn_msg->data; + op = vss_msg->vss_hdr.operation; + error = HV_S_OK; + + switch (op) { + case VSS_OP_FREEZE: + case VSS_OP_THAW: + error = vss_operate(op); + if (error) + error = HV_E_FAIL; + break; + default: + syslog(LOG_ERR, "Illegal op:%d\n", op); + } + vss_msg->error = error; + len = netlink_send(fd, incoming_cn_msg); + if (len < 0) { + syslog(LOG_ERR, "net_link send failed; error:%d", len); + exit(EXIT_FAILURE); + } + } + +} -- cgit From fa882867ae5f8543eb304a1667563f1c99514475 Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Fri, 8 Mar 2013 09:21:46 +0100 Subject: ipack: add ipack_get_device() ipack_put_device() Prepare everything for later use. Signed-off-by: Samuel Iglesias Gonsalvez Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/ipack.c | 12 ++++++++++++ include/linux/ipack.h | 3 +++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/ipack/ipack.c b/drivers/ipack/ipack.c index 7ec6b208b1cb..4f913aa88971 100644 --- a/drivers/ipack/ipack.c +++ b/drivers/ipack/ipack.c @@ -461,6 +461,18 @@ void ipack_device_unregister(struct ipack_device *dev) } EXPORT_SYMBOL_GPL(ipack_device_unregister); +void ipack_get_device(struct ipack_device *dev) +{ + get_device(&dev->dev); +} +EXPORT_SYMBOL_GPL(ipack_get_device); + +void ipack_put_device(struct ipack_device *dev) +{ + put_device(&dev->dev); +} +EXPORT_SYMBOL_GPL(ipack_put_device); + static int __init ipack_init(void) { ida_init(&ipack_ida); diff --git a/include/linux/ipack.h b/include/linux/ipack.h index fea12cbb2aeb..def91fd996f4 100644 --- a/include/linux/ipack.h +++ b/include/linux/ipack.h @@ -221,6 +221,9 @@ void ipack_driver_unregister(struct ipack_driver *edrv); int ipack_device_register(struct ipack_device *dev); void ipack_device_unregister(struct ipack_device *dev); +void ipack_get_device(struct ipack_device *dev); +void ipack_put_device(struct ipack_device *dev); + /** * DEFINE_IPACK_DEVICE_TABLE - macro used to describe a IndustryPack table * @_table: device table name -- cgit From e926301b39a07f587ff8c66354a2e2ee4c29162c Mon Sep 17 00:00:00 2001 From: Samuel Iglesias Gonsalvez Date: Fri, 8 Mar 2013 09:21:47 +0100 Subject: ipack: split ipack_device_register() in several functions One function is ipack_device_init(). If it fails, the caller should execute ipack_put_device(). The second function is ipack_device_add that only adds the device. If it fails, the caller should execute ipack_put_device(). 
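
[Editorial note: a minimal sketch of the caller pattern this split enables, mirroring the tpci200_create_device() conversion in the diff below. The example_create_device() wrapper and its arguments are illustrative; field population is elided. Per the NOTEs this patch adds to ipack.h, once ipack_device_init() has run the device must be released with ipack_put_device(), never freed directly.]

#include <linux/slab.h>
#include <linux/ipack.h>

static int example_create_device(struct ipack_bus_device *bus, int slot)
{
	struct ipack_device *dev;
	int ret;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return -ENOMEM;
	dev->bus = bus;
	dev->slot = slot;
	/* populate dev->region[] here, as required by ipack_device_init() */

	ret = ipack_device_init(dev);
	if (ret < 0) {
		ipack_put_device(dev);	/* drops the initial reference; do not kfree() */
		return ret;
	}

	ret = ipack_device_add(dev);
	if (ret < 0)
		ipack_put_device(dev);
	return ret;
}
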
Then the device is removed with refcount = 0, as device_register() kernel documentation says. ipack_device_del() is added to remove the device. Signed-off-by: Samuel Iglesias Gonsalvez Signed-off-by: Greg Kroah-Hartman --- drivers/ipack/carriers/tpci200.c | 14 +++++++++++++- drivers/ipack/ipack.c | 24 ++++++++++++++---------- include/linux/ipack.h | 39 +++++++++++++++++++++++++++++---------- 3 files changed, 56 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/ipack/carriers/tpci200.c b/drivers/ipack/carriers/tpci200.c index 0246b1fddffe..c276fde318e5 100644 --- a/drivers/ipack/carriers/tpci200.c +++ b/drivers/ipack/carriers/tpci200.c @@ -480,6 +480,7 @@ static void tpci200_release_device(struct ipack_device *dev) static int tpci200_create_device(struct tpci200_board *tpci200, int i) { + int ret; enum ipack_space space; struct ipack_device *dev = kzalloc(sizeof(struct ipack_device), GFP_KERNEL); @@ -495,7 +496,18 @@ static int tpci200_create_device(struct tpci200_board *tpci200, int i) + tpci200_space_interval[space] * i; dev->region[space].size = tpci200_space_size[space]; } - return ipack_device_register(dev); + + ret = ipack_device_init(dev); + if (ret < 0) { + ipack_put_device(dev); + return ret; + } + + ret = ipack_device_add(dev); + if (ret < 0) + ipack_put_device(dev); + + return ret; } static int tpci200_pci_probe(struct pci_dev *pdev, diff --git a/drivers/ipack/ipack.c b/drivers/ipack/ipack.c index 4f913aa88971..6e066c53acce 100644 --- a/drivers/ipack/ipack.c +++ b/drivers/ipack/ipack.c @@ -227,7 +227,7 @@ static int ipack_unregister_bus_member(struct device *dev, void *data) struct ipack_bus_device *bus = data; if (idev->bus == bus) - ipack_device_unregister(idev); + ipack_device_del(idev); return 1; } @@ -419,7 +419,7 @@ out: return ret; } -int ipack_device_register(struct ipack_device *dev) +int ipack_device_init(struct ipack_device *dev) { int ret; @@ -428,6 +428,7 @@ int ipack_device_register(struct ipack_device *dev) dev->dev.parent = dev->bus->parent; dev_set_name(&dev->dev, "ipack-dev.%u.%u", dev->bus->bus_nr, dev->slot); + device_initialize(&dev->dev); if (dev->bus->ops->set_clockrate(dev, 8)) dev_warn(&dev->dev, "failed to switch to 8 MHz operation for reading of device ID.\n"); @@ -447,19 +448,22 @@ int ipack_device_register(struct ipack_device *dev) dev_err(&dev->dev, "failed to switch to 32 MHz operation.\n"); } - ret = device_register(&dev->dev); - if (ret < 0) - kfree(dev->id); + return 0; +} +EXPORT_SYMBOL_GPL(ipack_device_init); - return ret; +int ipack_device_add(struct ipack_device *dev) +{ + return device_add(&dev->dev); } -EXPORT_SYMBOL_GPL(ipack_device_register); +EXPORT_SYMBOL_GPL(ipack_device_add); -void ipack_device_unregister(struct ipack_device *dev) +void ipack_device_del(struct ipack_device *dev) { - device_unregister(&dev->dev); + device_del(&dev->dev); + ipack_put_device(dev); } -EXPORT_SYMBOL_GPL(ipack_device_unregister); +EXPORT_SYMBOL_GPL(ipack_device_del); void ipack_get_device(struct ipack_device *dev) { diff --git a/include/linux/ipack.h b/include/linux/ipack.h index def91fd996f4..1888e06ddf64 100644 --- a/include/linux/ipack.h +++ b/include/linux/ipack.h @@ -207,19 +207,38 @@ int ipack_driver_register(struct ipack_driver *edrv, struct module *owner, void ipack_driver_unregister(struct ipack_driver *edrv); /** - * ipack_device_register -- register an IPack device with the kernel - * @dev: the new device to register. + * ipack_device_init -- initialize an IPack device + * @dev: the new device to initialize. 
* - * Register a new IPack device ("module" in IndustryPack jargon). The call - * is done by the carrier driver. The carrier should populate the fields - * bus and slot as well as the region array of @dev prior to calling this - * function. The rest of the fields will be allocated and populated - * during registration. + * Initialize a new IPack device ("module" in IndustryPack jargon). The call + * is done by the carrier driver. The carrier should populate the fields + * bus and slot as well as the region array of @dev prior to calling this + * function. The rest of the fields will be allocated and populated + * during initialization. * - * Return zero on success or error code on failure. + * Return zero on success or error code on failure. + * + * NOTE: _Never_ directly free @dev after calling this function, even + * if it returned an error! Always use ipack_put_device() to give up the + * reference initialized in this function instead. + */ +int ipack_device_init(struct ipack_device *dev); + +/** + * ipack_device_add -- Add an IPack device + * @dev: the new device to add. + * + * Add a new IPack device. The call is done by the carrier driver + * after calling ipack_device_init(). + * + * Return zero on success or error code on failure. + * + * NOTE: _Never_ directly free @dev after calling this function, even + * if it returned an error! Always use ipack_put_device() to give up the + * reference initialized in this function instead. */ -int ipack_device_register(struct ipack_device *dev); -void ipack_device_unregister(struct ipack_device *dev); +int ipack_device_add(struct ipack_device *dev); +void ipack_device_del(struct ipack_device *dev); void ipack_get_device(struct ipack_device *dev); void ipack_put_device(struct ipack_device *dev); -- cgit From 5caf4636259ae3af0efbb9bfc4cd97874b547c7d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 12 Mar 2013 11:56:46 +0800 Subject: clocksource: Add new feature flag CLOCK_SOURCE_SUSPEND_NONSTOP Some x86 processors have a TSC clocksource, which continues to run even when the system is suspended. Also most OMAP platforms have a 32 KHz timer which has similar capability. Add a feature flag so that it could be utilized. Signed-off-by: Feng Tang Signed-off-by: John Stultz --- include/linux/clocksource.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 27cfda427dd9..aa7032c7238f 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -206,6 +206,7 @@ struct clocksource { #define CLOCK_SOURCE_WATCHDOG 0x10 #define CLOCK_SOURCE_VALID_FOR_HRES 0x20 #define CLOCK_SOURCE_UNSTABLE 0x40 +#define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) (cycle_t)((bits) < 64 ? ((1ULL<<(bits))-1) : -1) -- cgit From a362db3d6c8a952cbde510b1fa35d0ee001b347e Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 16 Mar 2013 04:47:55 +0000 Subject: net: fix some typos in netif features Cc: Pravin B Shelar Cc: "David S. Miller" Signed-off-by: Cong Wang Acked-by: Pravin B Shelar Signed-off-by: David S.
Miller --- include/linux/netdev_features.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index f5e797c0c2a4..d6ee2d008ee4 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -102,8 +102,8 @@ enum { #define NETIF_F_VLAN_CHALLENGED __NETIF_F(VLAN_CHALLENGED) #define NETIF_F_RXFCS __NETIF_F(RXFCS) #define NETIF_F_RXALL __NETIF_F(RXALL) -#define NETIF_F_GRE_GSO __NETIF_F(GSO_GRE) -#define NETIF_F_UDP_TUNNEL __NETIF_F(UDP_TUNNEL) +#define NETIF_F_GSO_GRE __NETIF_F(GSO_GRE) +#define NETIF_F_GSO_UDP_TUNNEL __NETIF_F(GSO_UDP_TUNNEL) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ -- cgit From 1a2c6181c4a1922021b4d7df373bba612c3e5f04 Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Sun, 17 Mar 2013 08:23:34 +0000 Subject: tcp: Remove TCPCT TCPCT uses option-number 253, reserved for experimental use and should not be used in production environments. Further, TCPCT does not fully implement RFC 6013. As a nice side-effect, removing TCPCT increases TCP's performance for very short flows: Doing an apache-benchmark with -c 100 -n 100000, sending HTTP-requests for files of 1KB size. before this patch: average (among 7 runs) of 20845.5 Requests/Second after: average (among 7 runs) of 21403.6 Requests/Second Signed-off-by: Christoph Paasch Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 8 - drivers/infiniband/hw/cxgb4/cm.c | 2 +- include/linux/tcp.h | 10 -- include/net/request_sock.h | 8 +- include/net/tcp.h | 89 +---------- include/uapi/linux/tcp.h | 26 ---- net/dccp/ipv4.c | 5 +- net/dccp/ipv6.c | 5 +- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/syncookies.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 7 - net/ipv4/tcp.c | 267 --------------------------------- net/ipv4/tcp_input.c | 69 +-------- net/ipv4/tcp_ipv4.c | 60 +------- net/ipv4/tcp_minisocks.c | 40 +---- net/ipv4/tcp_output.c | 219 +-------------------------- net/ipv6/syncookies.c | 3 +- net/ipv6/tcp_ipv6.c | 56 +------ 18 files changed, 38 insertions(+), 841 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 18a24c405ac0..17953e2bc3e9 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -175,14 +175,6 @@ tcp_congestion_control - STRING is inherited. [see setsockopt(listenfd, SOL_TCP, TCP_CONGESTION, "name" ...) ] -tcp_cookie_size - INTEGER - Default size of TCP Cookie Transactions (TCPCT) option, that may be - overridden on a per socket basis by the TCPCT socket option. - Values greater than the maximum (16) are interpreted as the maximum. - Values greater than zero and less than the minimum (8) are interpreted - as the minimum. Odd values are interpreted as the next even value. - Default: 0 (off). - tcp_dsack - BOOLEAN Allows TCP to send "duplicate" SACKs. 
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 8dcc84fd9d30..54fd31fcc332 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -2915,7 +2915,7 @@ static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid , u8 tos) */ memset(&tmp_opt, 0, sizeof(tmp_opt)); tcp_clear_options(&tmp_opt); - tcp_parse_options(skb, &tmp_opt, NULL, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, NULL); req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req)); memset(req, 0, sizeof(*req)); diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 763c108ee03d..ed6a7456eecd 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -90,9 +90,6 @@ struct tcp_options_received { sack_ok : 4, /* SACK seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ - u8 cookie_plus:6, /* bytes in authenticator/cookie option */ - cookie_out_never:1, - cookie_in_always:1; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ @@ -102,7 +99,6 @@ static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; - rx_opt->cookie_plus = 0; } /* This is the max number of SACKS that we'll generate and process. It's safe @@ -320,12 +316,6 @@ struct tcp_sock { struct tcp_md5sig_info __rcu *md5sig_info; #endif - /* When the cookie options are generated and exchanged, then this - * object holds a reference to them (cookie_values->kref). Also - * contains related tcp_cookie_transactions fields. - */ - struct tcp_cookie_values *cookie_values; - /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big diff --git a/include/net/request_sock.h b/include/net/request_sock.h index a51dbd17c2de..9069e65c1c56 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -27,19 +27,13 @@ struct sk_buff; struct dst_entry; struct proto; -/* empty to "strongly type" an otherwise void parameter. - */ -struct request_values { -}; - struct request_sock_ops { int family; int obj_size; struct kmem_cache *slab; char *slab_name; int (*rtx_syn_ack)(struct sock *sk, - struct request_sock *req, - struct request_values *rvp); + struct request_sock *req); void (*send_ack)(struct sock *sk, struct sk_buff *skb, struct request_sock *req); void (*send_reset)(struct sock *sk, diff --git a/include/net/tcp.h b/include/net/tcp.h index ab9f947b118b..7f2f17198d75 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -179,7 +179,6 @@ extern void tcp_time_wait(struct sock *sk, int state, int timeo); #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ -#define TCPOPT_COOKIE 253 /* Cookie extension (experimental) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. 
See draft-ietf-tcpm-experimental-options-00.txt @@ -454,7 +453,7 @@ extern void tcp_syn_ack_timeout(struct sock *sk, struct request_sock *req); extern int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void tcp_parse_options(const struct sk_buff *skb, - struct tcp_options_received *opt_rx, const u8 **hvpp, + struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); extern const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); @@ -476,7 +475,6 @@ extern int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, extern int tcp_connect(struct sock *sk); extern struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, struct tcp_fastopen_cookie *foc); extern int tcp_disconnect(struct sock *sk, int flags); @@ -1589,91 +1587,6 @@ struct tcp_request_sock_ops { #endif }; -/* Using SHA1 for now, define some constants. - */ -#define COOKIE_DIGEST_WORDS (SHA_DIGEST_WORDS) -#define COOKIE_MESSAGE_WORDS (SHA_MESSAGE_BYTES / 4) -#define COOKIE_WORKSPACE_WORDS (COOKIE_DIGEST_WORDS + COOKIE_MESSAGE_WORDS) - -extern int tcp_cookie_generator(u32 *bakery); - -/** - * struct tcp_cookie_values - each socket needs extra space for the - * cookies, together with (optional) space for any SYN data. - * - * A tcp_sock contains a pointer to the current value, and this is - * cloned to the tcp_timewait_sock. - * - * @cookie_pair: variable data from the option exchange. - * - * @cookie_desired: user specified tcpct_cookie_desired. Zero - * indicates default (sysctl_tcp_cookie_size). - * After cookie sent, remembers size of cookie. - * Range 0, TCP_COOKIE_MIN to TCP_COOKIE_MAX. - * - * @s_data_desired: user specified tcpct_s_data_desired. When the - * constant payload is specified (@s_data_constant), - * holds its length instead. - * Range 0 to TCP_MSS_DESIRED. - * - * @s_data_payload: constant data that is to be included in the - * payload of SYN or SYNACK segments when the - * cookie option is present. - */ -struct tcp_cookie_values { - struct kref kref; - u8 cookie_pair[TCP_COOKIE_PAIR_SIZE]; - u8 cookie_pair_size; - u8 cookie_desired; - u16 s_data_desired:11, - s_data_constant:1, - s_data_in:1, - s_data_out:1, - s_data_unused:2; - u8 s_data_payload[0]; -}; - -static inline void tcp_cookie_values_release(struct kref *kref) -{ - kfree(container_of(kref, struct tcp_cookie_values, kref)); -} - -/* The length of constant payload data. Note that s_data_desired is - * overloaded, depending on s_data_constant: either the length of constant - * data (returned here) or the limit on variable data. - */ -static inline int tcp_s_data_size(const struct tcp_sock *tp) -{ - return (tp->cookie_values != NULL && tp->cookie_values->s_data_constant) - ? tp->cookie_values->s_data_desired - : 0; -} - -/** - * struct tcp_extend_values - tcp_ipv?.c to tcp_output.c workspace. - * - * As tcp_request_sock has already been extended in other places, the - * only remaining method is to pass stack values along as function - * parameters. These parameters are not needed after sending SYNACK. - * - * @cookie_bakery: cryptographic secret and message workspace. - * - * @cookie_plus: bytes in authenticator/cookie option, copied from - * struct tcp_options_received (above). 
- */ -struct tcp_extend_values { - struct request_values rv; - u32 cookie_bakery[COOKIE_WORKSPACE_WORDS]; - u8 cookie_plus:6, - cookie_out_never:1, - cookie_in_always:1; -}; - -static inline struct tcp_extend_values *tcp_xv(struct request_values *rvp) -{ - return (struct tcp_extend_values *)rvp; -} - extern void tcp_v4_init(void); extern void tcp_init(void); diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 6b1ead0b0c9d..8d776ebc4829 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -102,7 +102,6 @@ enum { #define TCP_QUICKACK 12 /* Block/reenable quick acks */ #define TCP_CONGESTION 13 /* Congestion control algorithm */ #define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ -#define TCP_COOKIE_TRANSACTIONS 15 /* TCP Cookie Transactions */ #define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ #define TCP_THIN_DUPACK 17 /* Fast retrans. after 1 dupack */ #define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ @@ -199,29 +198,4 @@ struct tcp_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ }; -/* for TCP_COOKIE_TRANSACTIONS (TCPCT) socket option */ -#define TCP_COOKIE_MIN 8 /* 64-bits */ -#define TCP_COOKIE_MAX 16 /* 128-bits */ -#define TCP_COOKIE_PAIR_SIZE (2*TCP_COOKIE_MAX) - -/* Flags for both getsockopt and setsockopt */ -#define TCP_COOKIE_IN_ALWAYS (1 << 0) /* Discard SYN without cookie */ -#define TCP_COOKIE_OUT_NEVER (1 << 1) /* Prohibit outgoing cookies, - * supercedes everything. */ - -/* Flags for getsockopt */ -#define TCP_S_DATA_IN (1 << 2) /* Was data received? */ -#define TCP_S_DATA_OUT (1 << 3) /* Was data sent? */ - -/* TCP_COOKIE_TRANSACTIONS data */ -struct tcp_cookie_transactions { - __u16 tcpct_flags; /* see above */ - __u8 __tcpct_pad1; /* zero */ - __u8 tcpct_cookie_desired; /* bytes */ - __u16 tcpct_s_data_desired; /* bytes of variable data */ - __u16 tcpct_used; /* bytes in value */ - __u8 tcpct_value[TCP_MSS_DEFAULT]; -}; - - #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 4f9f5eb478f1..ebc54fef85a5 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -500,8 +500,7 @@ static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, return &rt->dst; } -static int dccp_v4_send_response(struct sock *sk, struct request_sock *req, - struct request_values *rv_unused) +static int dccp_v4_send_response(struct sock *sk, struct request_sock *req) { int err = -1; struct sk_buff *skb; @@ -658,7 +657,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_gss = dreq->dreq_iss; dreq->dreq_service = service; - if (dccp_v4_send_response(sk, req, NULL)) + if (dccp_v4_send_response(sk, req)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6e05981f271e..9c61f9c02fdb 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -213,8 +213,7 @@ out: } -static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, - struct request_values *rv_unused) +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req) { struct inet6_request_sock *ireq6 = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); @@ -428,7 +427,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_gss = dreq->dreq_iss; dreq->dreq_service = service; - if (dccp_v6_send_response(sk, req, NULL)) + if (dccp_v6_send_response(sk, req)) goto drop_and_free; inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); diff 
--git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 786d97aee751..6acb541c9091 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -559,7 +559,7 @@ static inline void syn_ack_recalc(struct request_sock *req, const int thresh, int inet_rtx_syn_ack(struct sock *parent, struct request_sock *req) { - int err = req->rsk_ops->rtx_syn_ack(parent, req, NULL); + int err = req->rsk_ops->rtx_syn_ack(parent, req); if (!err) req->num_retrans++; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index ef54377fb11c..7f4a5cb8f8d0 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -267,7 +267,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct ip_options *opt) { struct tcp_options_received tcp_opt; - const u8 *hash_location; struct inet_request_sock *ireq; struct tcp_request_sock *treq; struct tcp_sock *tp = tcp_sk(sk); @@ -294,7 +293,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tcp_opt, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index cca4550f4082..cb45062c8be0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -732,13 +732,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "tcp_cookie_size", - .data = &sysctl_tcp_cookie_size, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_thin_linear_timeouts", .data = &sysctl_tcp_thin_linear_timeouts, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8d14573ade77..17a6810af5c8 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -409,15 +409,6 @@ void tcp_init_sock(struct sock *sk) icsk->icsk_sync_mss = tcp_sync_mss; - /* TCP Cookie Transactions */ - if (sysctl_tcp_cookie_size > 0) { - /* Default, cookies without s_data_payload. */ - tp->cookie_values = - kzalloc(sizeof(*tp->cookie_values), - sk->sk_allocation); - if (tp->cookie_values != NULL) - kref_init(&tp->cookie_values->kref); - } /* Presumed zeroed, in order of appearance: * cookie_in_always, cookie_out_never, * s_data_constant, s_data_in, s_data_out @@ -2397,92 +2388,6 @@ static int do_tcp_setsockopt(struct sock *sk, int level, release_sock(sk); return err; } - case TCP_COOKIE_TRANSACTIONS: { - struct tcp_cookie_transactions ctd; - struct tcp_cookie_values *cvp = NULL; - - if (sizeof(ctd) > optlen) - return -EINVAL; - if (copy_from_user(&ctd, optval, sizeof(ctd))) - return -EFAULT; - - if (ctd.tcpct_used > sizeof(ctd.tcpct_value) || - ctd.tcpct_s_data_desired > TCP_MSS_DESIRED) - return -EINVAL; - - if (ctd.tcpct_cookie_desired == 0) { - /* default to global value */ - } else if ((0x1 & ctd.tcpct_cookie_desired) || - ctd.tcpct_cookie_desired > TCP_COOKIE_MAX || - ctd.tcpct_cookie_desired < TCP_COOKIE_MIN) { - return -EINVAL; - } - - if (TCP_COOKIE_OUT_NEVER & ctd.tcpct_flags) { - /* Supercedes all other values */ - lock_sock(sk); - if (tp->cookie_values != NULL) { - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - tp->cookie_values = NULL; - } - tp->rx_opt.cookie_in_always = 0; /* false */ - tp->rx_opt.cookie_out_never = 1; /* true */ - release_sock(sk); - return err; - } - - /* Allocate ancillary memory before locking. 
- */ - if (ctd.tcpct_used > 0 || - (tp->cookie_values == NULL && - (sysctl_tcp_cookie_size > 0 || - ctd.tcpct_cookie_desired > 0 || - ctd.tcpct_s_data_desired > 0))) { - cvp = kzalloc(sizeof(*cvp) + ctd.tcpct_used, - GFP_KERNEL); - if (cvp == NULL) - return -ENOMEM; - - kref_init(&cvp->kref); - } - lock_sock(sk); - tp->rx_opt.cookie_in_always = - (TCP_COOKIE_IN_ALWAYS & ctd.tcpct_flags); - tp->rx_opt.cookie_out_never = 0; /* false */ - - if (tp->cookie_values != NULL) { - if (cvp != NULL) { - /* Changed values are recorded by a changed - * pointer, ensuring the cookie will differ, - * without separately hashing each value later. - */ - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - } else { - cvp = tp->cookie_values; - } - } - - if (cvp != NULL) { - cvp->cookie_desired = ctd.tcpct_cookie_desired; - - if (ctd.tcpct_used > 0) { - memcpy(cvp->s_data_payload, ctd.tcpct_value, - ctd.tcpct_used); - cvp->s_data_desired = ctd.tcpct_used; - cvp->s_data_constant = 1; /* true */ - } else { - /* No constant payload data. */ - cvp->s_data_desired = ctd.tcpct_s_data_desired; - cvp->s_data_constant = 0; /* false */ - } - - tp->cookie_values = cvp; - } - release_sock(sk); - return err; - } default: /* fallthru */ break; @@ -2902,41 +2807,6 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; return 0; - case TCP_COOKIE_TRANSACTIONS: { - struct tcp_cookie_transactions ctd; - struct tcp_cookie_values *cvp = tp->cookie_values; - - if (get_user(len, optlen)) - return -EFAULT; - if (len < sizeof(ctd)) - return -EINVAL; - - memset(&ctd, 0, sizeof(ctd)); - ctd.tcpct_flags = (tp->rx_opt.cookie_in_always ? - TCP_COOKIE_IN_ALWAYS : 0) - | (tp->rx_opt.cookie_out_never ? - TCP_COOKIE_OUT_NEVER : 0); - - if (cvp != NULL) { - ctd.tcpct_flags |= (cvp->s_data_in ? - TCP_S_DATA_IN : 0) - | (cvp->s_data_out ? - TCP_S_DATA_OUT : 0); - - ctd.tcpct_cookie_desired = cvp->cookie_desired; - ctd.tcpct_s_data_desired = cvp->s_data_desired; - - memcpy(&ctd.tcpct_value[0], &cvp->cookie_pair[0], - cvp->cookie_pair_size); - ctd.tcpct_used = cvp->cookie_pair_size; - } - - if (put_user(sizeof(ctd), optlen)) - return -EFAULT; - if (copy_to_user(optval, &ctd, sizeof(ctd))) - return -EFAULT; - return 0; - } case TCP_THIN_LINEAR_TIMEOUTS: val = tp->thin_lto; break; @@ -3409,134 +3279,6 @@ EXPORT_SYMBOL(tcp_md5_hash_key); #endif -/* Each Responder maintains up to two secret values concurrently for - * efficient secret rollover. Each secret value has 4 states: - * - * Generating. (tcp_secret_generating != tcp_secret_primary) - * Generates new Responder-Cookies, but not yet used for primary - * verification. This is a short-term state, typically lasting only - * one round trip time (RTT). - * - * Primary. (tcp_secret_generating == tcp_secret_primary) - * Used both for generation and primary verification. - * - * Retiring. (tcp_secret_retiring != tcp_secret_secondary) - * Used for verification, until the first failure that can be - * verified by the newer Generating secret. At that time, this - * cookie's state is changed to Secondary, and the Generating - * cookie's state is changed to Primary. This is a short-term state, - * typically lasting only one round trip time (RTT). - * - * Secondary. (tcp_secret_retiring == tcp_secret_secondary) - * Used for secondary verification, after primary verification - * failures. This state lasts no more than twice the Maximum Segment - * Lifetime (2MSL). Then, the secret is discarded. - */ -struct tcp_cookie_secret { - /* The secret is divided into two parts. 
The digest part is the - * equivalent of previously hashing a secret and saving the state, - * and serves as an initialization vector (IV). The message part - * serves as the trailing secret. - */ - u32 secrets[COOKIE_WORKSPACE_WORDS]; - unsigned long expires; -}; - -#define TCP_SECRET_1MSL (HZ * TCP_PAWS_MSL) -#define TCP_SECRET_2MSL (HZ * TCP_PAWS_MSL * 2) -#define TCP_SECRET_LIFE (HZ * 600) - -static struct tcp_cookie_secret tcp_secret_one; -static struct tcp_cookie_secret tcp_secret_two; - -/* Essentially a circular list, without dynamic allocation. */ -static struct tcp_cookie_secret *tcp_secret_generating; -static struct tcp_cookie_secret *tcp_secret_primary; -static struct tcp_cookie_secret *tcp_secret_retiring; -static struct tcp_cookie_secret *tcp_secret_secondary; - -static DEFINE_SPINLOCK(tcp_secret_locker); - -/* Select a pseudo-random word in the cookie workspace. - */ -static inline u32 tcp_cookie_work(const u32 *ws, const int n) -{ - return ws[COOKIE_DIGEST_WORDS + ((COOKIE_MESSAGE_WORDS-1) & ws[n])]; -} - -/* Fill bakery[COOKIE_WORKSPACE_WORDS] with generator, updating as needed. - * Called in softirq context. - * Returns: 0 for success. - */ -int tcp_cookie_generator(u32 *bakery) -{ - unsigned long jiffy = jiffies; - - if (unlikely(time_after_eq(jiffy, tcp_secret_generating->expires))) { - spin_lock_bh(&tcp_secret_locker); - if (!time_after_eq(jiffy, tcp_secret_generating->expires)) { - /* refreshed by another */ - memcpy(bakery, - &tcp_secret_generating->secrets[0], - COOKIE_WORKSPACE_WORDS); - } else { - /* still needs refreshing */ - get_random_bytes(bakery, COOKIE_WORKSPACE_WORDS); - - /* The first time, paranoia assumes that the - * randomization function isn't as strong. But, - * this secret initialization is delayed until - * the last possible moment (packet arrival). - * Although that time is observable, it is - * unpredictably variable. Mash in the most - * volatile clock bits available, and expire the - * secret extra quickly. - */ - if (unlikely(tcp_secret_primary->expires == - tcp_secret_secondary->expires)) { - struct timespec tv; - - getnstimeofday(&tv); - bakery[COOKIE_DIGEST_WORDS+0] ^= - (u32)tv.tv_nsec; - - tcp_secret_secondary->expires = jiffy - + TCP_SECRET_1MSL - + (0x0f & tcp_cookie_work(bakery, 0)); - } else { - tcp_secret_secondary->expires = jiffy - + TCP_SECRET_LIFE - + (0xff & tcp_cookie_work(bakery, 1)); - tcp_secret_primary->expires = jiffy - + TCP_SECRET_2MSL - + (0x1f & tcp_cookie_work(bakery, 2)); - } - memcpy(&tcp_secret_secondary->secrets[0], - bakery, COOKIE_WORKSPACE_WORDS); - - rcu_assign_pointer(tcp_secret_generating, - tcp_secret_secondary); - rcu_assign_pointer(tcp_secret_retiring, - tcp_secret_primary); - /* - * Neither call_rcu() nor synchronize_rcu() needed. - * Retiring data is not freed. It is replaced after - * further (locked) pointer updates, and a quiet time - * (minimum 1MSL, maximum LIFE - 2MSL). 
- */ - } - spin_unlock_bh(&tcp_secret_locker); - } else { - rcu_read_lock_bh(); - memcpy(bakery, - &rcu_dereference(tcp_secret_generating)->secrets[0], - COOKIE_WORKSPACE_WORDS); - rcu_read_unlock_bh(); - } - return 0; -} -EXPORT_SYMBOL(tcp_cookie_generator); - void tcp_done(struct sock *sk) { struct request_sock *req = tcp_sk(sk)->fastopen_rsk; @@ -3591,7 +3333,6 @@ void __init tcp_init(void) unsigned long limit; int max_rshare, max_wshare, cnt; unsigned int i; - unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3667,13 +3408,5 @@ void __init tcp_init(void) tcp_register_congestion_control(&tcp_reno); - memset(&tcp_secret_one.secrets[0], 0, sizeof(tcp_secret_one.secrets)); - memset(&tcp_secret_two.secrets[0], 0, sizeof(tcp_secret_two.secrets)); - tcp_secret_one.expires = jiffy; /* past due */ - tcp_secret_two.expires = jiffy; /* past due */ - tcp_secret_generating = &tcp_secret_one; - tcp_secret_primary = &tcp_secret_one; - tcp_secret_retiring = &tcp_secret_two; - tcp_secret_secondary = &tcp_secret_two; tcp_tasklet_init(); } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 836d74dd0187..19f0149fb6a2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3760,8 +3760,8 @@ old_ack: * But, this can also be called on packets in the established flow when * the fast version below fails. */ -void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *opt_rx, - const u8 **hvpp, int estab, +void tcp_parse_options(const struct sk_buff *skb, + struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc) { const unsigned char *ptr; @@ -3845,31 +3845,6 @@ void tcp_parse_options(const struct sk_buff *skb, struct tcp_options_received *o */ break; #endif - case TCPOPT_COOKIE: - /* This option is variable length. - */ - switch (opsize) { - case TCPOLEN_COOKIE_BASE: - /* not yet implemented */ - break; - case TCPOLEN_COOKIE_PAIR: - /* not yet implemented */ - break; - case TCPOLEN_COOKIE_MIN+0: - case TCPOLEN_COOKIE_MIN+2: - case TCPOLEN_COOKIE_MIN+4: - case TCPOLEN_COOKIE_MIN+6: - case TCPOLEN_COOKIE_MAX: - /* 16-bit multiple */ - opt_rx->cookie_plus = opsize; - *hvpp = ptr; - break; - default: - /* ignore option */ - break; - } - break; - case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. It's valid only in @@ -3915,8 +3890,7 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr * If it is wrong it falls back on tcp_parse_options(). */ static bool tcp_fast_parse_options(const struct sk_buff *skb, - const struct tcphdr *th, - struct tcp_sock *tp, const u8 **hvpp) + const struct tcphdr *th, struct tcp_sock *tp) { /* In the spirit of fast parsing, compare doff directly to constant * values. Because equality is used, short doff can be ignored here. @@ -3930,7 +3904,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, return true; } - tcp_parse_options(skb, &tp->rx_opt, hvpp, 1, NULL); + tcp_parse_options(skb, &tp->rx_opt, 1, NULL); if (tp->rx_opt.saw_tstamp) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -5311,12 +5285,10 @@ out: static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { - const u8 *hash_location; struct tcp_sock *tp = tcp_sk(sk); /* RFC1323: H1. Apply PAWS check first. 
*/ - if (tcp_fast_parse_options(skb, th, tp, &hash_location) && - tp->rx_opt.saw_tstamp && + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && tcp_paws_discard(sk, skb)) { if (!th->rst) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSESTABREJECTED); @@ -5670,12 +5642,11 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, if (mss == tp->rx_opt.user_mss) { struct tcp_options_received opt; - const u8 *hash_location; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ tcp_clear_options(&opt); opt.user_mss = opt.mss_clamp = 0; - tcp_parse_options(synack, &opt, &hash_location, 0, NULL); + tcp_parse_options(synack, &opt, 0, NULL); mss = opt.mss_clamp; } @@ -5706,14 +5677,12 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, unsigned int len) { - const u8 *hash_location; struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; - tcp_parse_options(skb, &tp->rx_opt, &hash_location, 0, &foc); + tcp_parse_options(skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp) tp->rx_opt.rcv_tsecr -= tp->tsoffset; @@ -5810,30 +5779,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, * is initialized. */ tp->copied_seq = tp->rcv_nxt; - if (cvp != NULL && - cvp->cookie_pair_size > 0 && - tp->rx_opt.cookie_plus > 0) { - int cookie_size = tp->rx_opt.cookie_plus - - TCPOLEN_COOKIE_BASE; - int cookie_pair_size = cookie_size - + cvp->cookie_desired; - - /* A cookie extension option was sent and returned. - * Note that each incoming SYNACK replaces the - * Responder cookie. The initial exchange is most - * fragile, as protection against spoofing relies - * entirely upon the sequence and timestamp (above). - * This replacement strategy allows the correct pair to - * pass through, while any others will be filtered via - * Responder verification later. 
- */ - if (sizeof(cvp->cookie_pair) >= cookie_pair_size) { - memcpy(&cvp->cookie_pair[cvp->cookie_desired], - hash_location, cookie_size); - cvp->cookie_pair_size = cookie_pair_size; - } - } - smp_mb(); tcp_finish_connect(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b7ab868c8284..b27c758ca23f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -838,7 +838,6 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb, */ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, u16 queue_mapping, bool nocache) { @@ -851,7 +850,7 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) return -1; - skb = tcp_make_synack(sk, dst, req, rvp, NULL); + skb = tcp_make_synack(sk, dst, req, NULL); if (skb) { __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr); @@ -868,10 +867,9 @@ static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst, return err; } -static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) +static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req) { - int res = tcp_v4_send_synack(sk, NULL, req, rvp, 0, false); + int res = tcp_v4_send_synack(sk, NULL, req, 0, false); if (!res) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); @@ -1371,8 +1369,7 @@ static bool tcp_fastopen_check(struct sock *sk, struct sk_buff *skb, static int tcp_v4_conn_req_fastopen(struct sock *sk, struct sk_buff *skb, struct sk_buff *skb_synack, - struct request_sock *req, - struct request_values *rvp) + struct request_sock *req) { struct tcp_sock *tp = tcp_sk(sk); struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; @@ -1467,9 +1464,7 @@ static int tcp_v4_conn_req_fastopen(struct sock *sk, int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - const u8 *hash_location; struct request_sock *req; struct inet_request_sock *ireq; struct tcp_sock *tp = tcp_sk(sk); @@ -1519,42 +1514,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = TCP_MSS_DEFAULT; tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, - want_cookie ? NULL : &foc); - - if (tmp_opt.cookie_plus > 0 && - tmp_opt.saw_tstamp && - !tp->rx_opt.cookie_out_never && - (sysctl_tcp_cookie_size > 0 || - (tp->cookie_values != NULL && - tp->cookie_values->cookie_desired > 0))) { - u8 *c; - u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; - int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - - if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) - goto drop_and_release; - - /* Secret recipe starts with IP addresses */ - *mess++ ^= (__force u32)daddr; - *mess++ ^= (__force u32)saddr; - - /* plus variable length Initiator Cookie */ - c = (u8 *)mess; - while (l-- > 0) - *c++ ^= *hash_location++; - - want_cookie = false; /* not our kind of cookie */ - tmp_ext.cookie_out_never = 0; /* false */ - tmp_ext.cookie_plus = tmp_opt.cookie_plus; - } else if (!tp->rx_opt.cookie_in_always) { - /* redundant indications, but ensure initialization. */ - tmp_ext.cookie_out_never = 1; /* true */ - tmp_ext.cookie_plus = 0; - } else { - goto drop_and_release; - } - tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? 
NULL : &foc); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1636,7 +1596,6 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) * of tcp_v4_send_synack()->tcp_select_initial_window(). */ skb_synack = tcp_make_synack(sk, dst, req, - (struct request_values *)&tmp_ext, fastopen_cookie_present(&valid_foc) ? &valid_foc : NULL); if (skb_synack) { @@ -1660,8 +1619,7 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) if (fastopen_cookie_present(&foc) && foc.len != 0) NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL); - } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req, - (struct request_values *)&tmp_ext)) + } else if (tcp_v4_conn_req_fastopen(sk, skb, skb_synack, req)) goto drop_and_free; return 0; @@ -2241,12 +2199,6 @@ void tcp_v4_destroy_sock(struct sock *sk) if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); - /* TCP Cookie Transactions */ - if (tp->cookie_values != NULL) { - kref_put(&tp->cookie_values->kref, - tcp_cookie_values_release); - tp->cookie_values = NULL; - } BUG_ON(tp->fastopen_rsk != NULL); /* If socket is aborted during connect operation */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 4bdb09fca401..8f0234f8bb95 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -93,13 +93,12 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th) { struct tcp_options_received tmp_opt; - const u8 *hash_location; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.rcv_tsecr -= tcptw->tw_ts_offset; @@ -388,32 +387,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct tcp_request_sock *treq = tcp_rsk(req); struct inet_connection_sock *newicsk = inet_csk(newsk); struct tcp_sock *newtp = tcp_sk(newsk); - struct tcp_sock *oldtp = tcp_sk(sk); - struct tcp_cookie_values *oldcvp = oldtp->cookie_values; - - /* TCP Cookie Transactions require space for the cookie pair, - * as it differs for each connection. There is no need to - * copy any s_data_payload stored at the original socket. - * Failure will prevent resuming the connection. 
- * - * Presumed copied, in order of appearance: - * cookie_in_always, cookie_out_never - */ - if (oldcvp != NULL) { - struct tcp_cookie_values *newcvp = - kzalloc(sizeof(*newtp->cookie_values), - GFP_ATOMIC); - - if (newcvp != NULL) { - kref_init(&newcvp->kref); - newcvp->cookie_desired = - oldcvp->cookie_desired; - newtp->cookie_values = newcvp; - } else { - /* Not Yet Implemented */ - newtp->cookie_values = NULL; - } - } /* Now setup tcp_sock */ newtp->pred_flags = 0; @@ -422,8 +395,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->rcv_nxt = treq->rcv_isn + 1; newtp->snd_sml = newtp->snd_una = - newtp->snd_nxt = newtp->snd_up = - treq->snt_isn + 1 + tcp_s_data_size(oldtp); + newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1; tcp_prequeue_init(newtp); INIT_LIST_HEAD(&newtp->tsq_node); @@ -460,8 +432,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, tcp_set_ca_state(newsk, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); - newtp->write_seq = newtp->pushed_seq = - treq->snt_isn + 1 + tcp_s_data_size(oldtp); + newtp->write_seq = newtp->pushed_seq = treq->snt_isn + 1; newtp->rx_opt.saw_tstamp = 0; @@ -538,7 +509,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool fastopen) { struct tcp_options_received tmp_opt; - const u8 *hash_location; struct sock *child; const struct tcphdr *th = tcp_hdr(skb); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); @@ -548,7 +518,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { tmp_opt.ts_recent = req->ts_recent; @@ -648,7 +618,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if ((flg & TCP_FLAG_ACK) && !fastopen && (TCP_SKB_CB(skb)->ack_seq != - tcp_rsk(req)->snt_isn + 1 + tcp_s_data_size(tcp_sk(sk)))) + tcp_rsk(req)->snt_isn + 1)) return sk; /* Also, it would be not so bad idea to check rcv_tsecr, which diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8e7742f0b5d2..ac5871ebe086 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -65,9 +65,6 @@ int sysctl_tcp_base_mss __read_mostly = TCP_BASE_MSS; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */ -EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size); - static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); @@ -386,7 +383,6 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_TS (1 << 1) #define OPTION_MD5 (1 << 2) #define OPTION_WSCALE (1 << 3) -#define OPTION_COOKIE_EXTENSION (1 << 4) #define OPTION_FAST_OPEN_COOKIE (1 << 8) struct tcp_out_options { @@ -400,36 +396,6 @@ struct tcp_out_options { struct tcp_fastopen_cookie *fastopen_cookie; /* Fast open cookie */ }; -/* The sysctl int routines are generic, so check consistency here. 
- */ -static u8 tcp_cookie_size_check(u8 desired) -{ - int cookie_size; - - if (desired > 0) - /* previously specified */ - return desired; - - cookie_size = ACCESS_ONCE(sysctl_tcp_cookie_size); - if (cookie_size <= 0) - /* no default specified */ - return 0; - - if (cookie_size <= TCP_COOKIE_MIN) - /* value too small, specify minimum */ - return TCP_COOKIE_MIN; - - if (cookie_size >= TCP_COOKIE_MAX) - /* value too large, specify maximum */ - return TCP_COOKIE_MAX; - - if (cookie_size & 1) - /* 8-bit multiple, illegal, fix it */ - cookie_size++; - - return (u8)cookie_size; -} - /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -448,27 +414,9 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, { u16 options = opts->options; /* mungable copy */ - /* Having both authentication and cookies for security is redundant, - * and there's certainly not enough room. Instead, the cookie-less - * extension variant is proposed. - * - * Consider the pessimal case with authentication. The options - * could look like: - * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40 - */ if (unlikely(OPTION_MD5 & options)) { - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - *ptr++ = htonl((TCPOPT_COOKIE << 24) | - (TCPOLEN_COOKIE_BASE << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } else { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - } - options &= ~OPTION_COOKIE_EXTENSION; + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); /* overload cookie hash location */ opts->hash_location = (__u8 *)ptr; ptr += 4; @@ -497,44 +445,6 @@ static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } - /* Specification requires after timestamp, so do it now. - * - * Consider the pessimal case without authentication. The options - * could look like: - * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40 - */ - if (unlikely(OPTION_COOKIE_EXTENSION & options)) { - __u8 *cookie_copy = opts->hash_location; - u8 cookie_size = opts->hash_size; - - /* 8-bit multiple handled in tcp_cookie_size_check() above, - * and elsewhere. - */ - if (0x2 & cookie_size) { - __u8 *p = (__u8 *)ptr; - - /* 16-bit multiple */ - *p++ = TCPOPT_COOKIE; - *p++ = TCPOLEN_COOKIE_BASE + cookie_size; - *p++ = *cookie_copy++; - *p++ = *cookie_copy++; - ptr++; - cookie_size -= 2; - } else { - /* 32-bit multiple */ - *ptr++ = htonl(((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_COOKIE << 8) | - TCPOLEN_COOKIE_BASE) + - cookie_size); - } - - if (cookie_size > 0) { - memcpy(ptr, cookie_copy, cookie_size); - ptr += (cookie_size / 4); - } - } - if (unlikely(OPTION_SACK_ADVERTISE & options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -593,11 +503,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, struct tcp_md5sig_key **md5) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_cookie_values *cvp = tp->cookie_values; unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ? - tcp_cookie_size_check(cvp->cookie_desired) : - 0; struct tcp_fastopen_request *fastopen = tp->fastopen_req; #ifdef CONFIG_TCP_MD5SIG @@ -649,52 +555,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, tp->syn_fastopen = 1; } } - /* Note that timestamps are required by the specification. 
- * - * Odd numbers of bytes are prohibited by the specification, ensuring - * that the cookie is 16-bit aligned, and the resulting cookie pair is - * 32-bit aligned. - */ - if (*md5 == NULL && - (OPTION_TS & opts->options) && - cookie_size > 0) { - int need = TCPOLEN_COOKIE_BASE + cookie_size; - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - - if (need > remaining) { - /* try shrinking cookie to fit */ - cookie_size -= 2; - need -= 4; - } - } - while (need > remaining && TCP_COOKIE_MIN <= cookie_size) { - cookie_size -= 4; - need -= 4; - } - if (TCP_COOKIE_MIN <= cookie_size) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_location = (__u8 *)&cvp->cookie_pair[0]; - opts->hash_size = cookie_size; - - /* Remember for future incarnations. */ - cvp->cookie_desired = cookie_size; - - if (cvp->cookie_desired != cvp->cookie_pair_size) { - /* Currently use random bytes as a nonce, - * assuming these are completely unpredictable - * by hostile users of the same system. - */ - get_random_bytes(&cvp->cookie_pair[0], - cookie_size); - cvp->cookie_pair_size = cookie_size; - } - remaining -= need; - } - } return MAX_TCP_OPTION_SPACE - remaining; } @@ -704,14 +565,10 @@ static unsigned int tcp_synack_options(struct sock *sk, unsigned int mss, struct sk_buff *skb, struct tcp_out_options *opts, struct tcp_md5sig_key **md5, - struct tcp_extend_values *xvp, struct tcp_fastopen_cookie *foc) { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; - u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ? - xvp->cookie_plus : - 0; #ifdef CONFIG_TCP_MD5SIG *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); @@ -759,28 +616,7 @@ static unsigned int tcp_synack_options(struct sock *sk, remaining -= need; } } - /* Similar rationale to tcp_syn_options() applies here, too. - * If the options fit, the same options should fit now! - */ - if (*md5 == NULL && - ireq->tstamp_ok && - cookie_plus > TCPOLEN_COOKIE_BASE) { - int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */ - - if (0x2 & need) { - /* 32-bit multiple */ - need += 2; /* NOPs */ - } - if (need <= remaining) { - opts->options |= OPTION_COOKIE_EXTENSION; - opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE; - remaining -= need; - } else { - /* There's no error return, so flag it. */ - xvp->cookie_out_never = 1; /* true */ - opts->hash_size = 0; - } - } + return MAX_TCP_OPTION_SPACE - remaining; } @@ -2802,32 +2638,24 @@ int tcp_send_synack(struct sock *sk) * sk: listener socket * dst: dst entry attached to the SYNACK * req: request_sock pointer - * rvp: request_values pointer * * Allocate one skb and build a SYNACK packet. * @dst is consumed : Caller should not use it again. 
*/ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, struct request_sock *req, - struct request_values *rvp, struct tcp_fastopen_cookie *foc) { struct tcp_out_options opts; - struct tcp_extend_values *xvp = tcp_xv(rvp); struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); - const struct tcp_cookie_values *cvp = tp->cookie_values; struct tcphdr *th; struct sk_buff *skb; struct tcp_md5sig_key *md5; int tcp_header_size; int mss; - int s_data_desired = 0; - if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) - s_data_desired = cvp->s_data_desired; - skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, - sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -2869,9 +2697,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, else #endif TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_header_size = tcp_synack_options(sk, req, mss, - skb, &opts, &md5, xvp, foc) - + sizeof(*th); + tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5, + foc) + sizeof(*th); skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); @@ -2889,40 +2716,6 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, TCPHDR_SYN | TCPHDR_ACK); - if (OPTION_COOKIE_EXTENSION & opts.options) { - if (s_data_desired) { - u8 *buf = skb_put(skb, s_data_desired); - - /* copy data directly from the listening socket. */ - memcpy(buf, cvp->s_data_payload, s_data_desired); - TCP_SKB_CB(skb)->end_seq += s_data_desired; - } - - if (opts.hash_size > 0) { - __u32 workspace[SHA_WORKSPACE_WORDS]; - u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS]; - u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1]; - - /* Secret recipe depends on the Timestamp, (future) - * Sequence and Acknowledgment Numbers, Initiator - * Cookie, and others handled by IP variant caller. - */ - *tail-- ^= opts.tsval; - *tail-- ^= tcp_rsk(req)->rcv_isn + 1; - *tail-- ^= TCP_SKB_CB(skb)->seq + 1; - - /* recommended */ - *tail-- ^= (((__force u32)th->dest << 16) | (__force u32)th->source); - *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */ - - sha_transform((__u32 *)&xvp->cookie_bakery[0], - (char *)mess, - &workspace[0]); - opts.hash_location = - (__u8 *)&xvp->cookie_bakery[0]; - } - } - th->seq = htonl(TCP_SKB_CB(skb)->seq); /* XXX data is queued and acked as is. 
No buffer/window check */ th->ack_seq = htonl(tcp_rsk(req)->rcv_nxt); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 8a0848b60b35..d5dda20bd717 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -149,7 +149,6 @@ static inline int cookie_check(const struct sk_buff *skb, __u32 cookie) struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) { struct tcp_options_received tcp_opt; - const u8 *hash_location; struct inet_request_sock *ireq; struct inet6_request_sock *ireq6; struct tcp_request_sock *treq; @@ -177,7 +176,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) /* check for timestamp cookie support */ memset(&tcp_opt, 0, sizeof(tcp_opt)); - tcp_parse_options(skb, &tcp_opt, &hash_location, 0, NULL); + tcp_parse_options(skb, &tcp_opt, 0, NULL); if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) goto out; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 9b6460055df5..0a97add2ab74 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -454,7 +454,6 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, struct flowi6 *fl6, struct request_sock *req, - struct request_values *rvp, u16 queue_mapping) { struct inet6_request_sock *treq = inet6_rsk(req); @@ -466,7 +465,7 @@ static int tcp_v6_send_synack(struct sock *sk, struct dst_entry *dst, if (!dst && (dst = inet6_csk_route_req(sk, fl6, req)) == NULL) goto done; - skb = tcp_make_synack(sk, dst, req, rvp, NULL); + skb = tcp_make_synack(sk, dst, req, NULL); if (skb) { __tcp_v6_send_check(skb, &treq->loc_addr, &treq->rmt_addr); @@ -481,13 +480,12 @@ done: return err; } -static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req, - struct request_values *rvp) +static int tcp_v6_rtx_synack(struct sock *sk, struct request_sock *req) { struct flowi6 fl6; int res; - res = tcp_v6_send_synack(sk, NULL, &fl6, req, rvp, 0); + res = tcp_v6_send_synack(sk, NULL, &fl6, req, 0); if (!res) TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); return res; @@ -940,9 +938,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp_extend_values tmp_ext; struct tcp_options_received tmp_opt; - const u8 *hash_location; struct request_sock *req; struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); @@ -980,50 +976,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); tmp_opt.user_mss = tp->rx_opt.user_mss; - tcp_parse_options(skb, &tmp_opt, &hash_location, 0, NULL); - - if (tmp_opt.cookie_plus > 0 && - tmp_opt.saw_tstamp && - !tp->rx_opt.cookie_out_never && - (sysctl_tcp_cookie_size > 0 || - (tp->cookie_values != NULL && - tp->cookie_values->cookie_desired > 0))) { - u8 *c; - u32 *d; - u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS]; - int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE; - - if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0) - goto drop_and_free; - - /* Secret recipe starts with IP addresses */ - d = (__force u32 *)&ipv6_hdr(skb)->daddr.s6_addr32[0]; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - d = (__force u32 *)&ipv6_hdr(skb)->saddr.s6_addr32[0]; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - *mess++ ^= *d++; - - /* plus variable length Initiator Cookie */ - c = (u8 *)mess; - while (l-- > 0) - *c++ ^= *hash_location++; - 
- want_cookie = false; /* not our kind of cookie */ - tmp_ext.cookie_out_never = 0; /* false */ - tmp_ext.cookie_plus = tmp_opt.cookie_plus; - } else if (!tp->rx_opt.cookie_in_always) { - /* redundant indications, but ensure initialization. */ - tmp_ext.cookie_out_never = 1; /* true */ - tmp_ext.cookie_plus = 0; - } else { - goto drop_and_free; - } - tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always; + tcp_parse_options(skb, &tmp_opt, 0, NULL); if (want_cookie && !tmp_opt.saw_tstamp) tcp_clear_options(&tmp_opt); @@ -1101,7 +1054,6 @@ have_isn: goto drop_and_release; if (tcp_v6_send_synack(sk, dst, &fl6, req, - (struct request_values *)&tmp_ext, skb_get_queue_mapping(skb)) || want_cookie) goto drop_and_free; -- cgit From 8655cc490e83f66476de8c1294411860325c3531 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Tue, 19 Feb 2013 21:10:30 +0000 Subject: iio: Add broken out info_mask fields for shared_by_type and separate This simplifies the code, removes an extensive layer of 'helper' macros and gives us twice as much room to play with in these masks before we have any need to be clever. Signed-off-by: Jonathan Cameron Acked-by: Lars-Peter Clausen --- drivers/iio/industrialio-core.c | 30 ++++++++++++++++++++++++++++++ include/linux/iio/iio.h | 10 +++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 6d8b02785647..f05289f7b512 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -708,6 +708,36 @@ static int iio_device_add_channel_sysfs(struct iio_dev *indio_dev, goto error_ret; attrcount++; } + for_each_set_bit(i, &chan->info_mask_separate, sizeof(long)*8) { + ret = __iio_add_chan_devattr(iio_chan_info_postfix[i], + chan, + &iio_read_channel_info, + &iio_write_channel_info, + i, + 0, + &indio_dev->dev, + &indio_dev->channel_attr_list); + if (ret < 0) + goto error_ret; + attrcount++; + } + for_each_set_bit(i, &chan->info_mask_shared_by_type, sizeof(long)*8) { + ret = __iio_add_chan_devattr(iio_chan_info_postfix[i], + chan, + &iio_read_channel_info, + &iio_write_channel_info, + i, + 1, + &indio_dev->dev, + &indio_dev->channel_attr_list); + if (ret == -EBUSY) { + ret = 0; + continue; + } else if (ret < 0) { + goto error_ret; + } + attrcount++; + } if (chan->ext_info) { unsigned int i = 0; diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index da8c776ba0bd..76976509d628 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -218,6 +218,10 @@ ssize_t iio_enum_write(struct iio_dev *indio_dev, * endianness: little or big endian * @info_mask: What information is to be exported about this channel. * This includes calibbias, scale etc. + * @info_mask_separate: What information is to be exported that is specific to + * this channel. + * @info_mask_shared_by_type: What information is to be exported that is shared +* by all channels of the same type. * @event_mask: What events can this channel produce. * @ext_info: Array of extended info attributes for this channel. 
* The array is NULL terminated, the last element should @@ -253,6 +257,8 @@ struct iio_chan_spec { enum iio_endian endianness; } scan_type; long info_mask; + long info_mask_separate; + long info_mask_shared_by_type; long event_mask; const struct iio_chan_spec_ext_info *ext_info; const char *extend_name; @@ -275,7 +281,9 @@ struct iio_chan_spec { static inline bool iio_channel_has_info(const struct iio_chan_spec *chan, enum iio_chan_info_enum type) { - return chan->info_mask & IIO_CHAN_INFO_BITS(type); + return (chan->info_mask & IIO_CHAN_INFO_BITS(type)) | + (chan->info_mask_separate & type) | + (chan->info_mask_shared_by_type & type); } #define IIO_ST(si, rb, sb, sh) \ -- cgit From 5ea864940e8ab63c2669902650b807d0507c390d Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 27 Feb 2013 19:41:59 +0000 Subject: iio:st_sensors move to info_mask_(shared_by_type/separate) The original info_mask is going away in favour of the broken out versions. Signed-off-by: Jonathan Cameron Acked-by: Denis Ciocca --- include/linux/iio/common/st_sensors.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/common/st_sensors.h b/include/linux/iio/common/st_sensors.h index 1f86a97ab2e2..7c0c0d3aef35 100644 --- a/include/linux/iio/common/st_sensors.h +++ b/include/linux/iio/common/st_sensors.h @@ -15,6 +15,7 @@ #include #include #include +#include #define ST_SENSORS_TX_MAX_LENGTH 2 #define ST_SENSORS_RX_MAX_LENGTH 6 @@ -45,8 +46,8 @@ { \ .type = device_type, \ .modified = 1, \ - .info_mask = IIO_CHAN_INFO_RAW_SEPARATE_BIT | \ - IIO_CHAN_INFO_SCALE_SEPARATE_BIT, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE), \ .scan_index = index, \ .channel2 = mod, \ .address = addr, \ -- cgit From ea0c68006321eea78a3702a9d68ff9395e06da38 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 27 Feb 2013 19:42:39 +0000 Subject: iio:adc:ad_sigma_delta move to info_mask_(shared_by_type/separate) The original info_mask is going away in favour of the broken out versions. Signed-off-by: Jonathan Cameron Acked-by: Lars-Peter Clausen --- include/linux/iio/adc/ad_sigma_delta.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index 2e4eab9868a3..e7fdec4db9da 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -133,9 +133,9 @@ int ad_sd_validate_trigger(struct iio_dev *indio_dev, struct iio_trigger *trig); .channel2 = (_channel2), \ .address = (_address), \ .extend_name = (_extend_name), \ - .info_mask = IIO_CHAN_INFO_RAW_SEPARATE_BIT | \ - IIO_CHAN_INFO_SCALE_SHARED_BIT | \ - IIO_CHAN_INFO_OFFSET_SEPARATE_BIT, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_OFFSET), \ + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ .scan_index = (_si), \ .scan_type = { \ .sign = 'u', \ -- cgit From b841f8abc27466026ecf4e5590c6c737c2e86e7e Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 27 Feb 2013 19:35:55 +0000 Subject: staging:iio:accel:adis move to info_mask_(shared_by_type/separate) The original info_mask is going away in favour of the broken out versions. 
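As a worked illustration of the conversion (a hypothetical channel table; example_channels and the IIO_VOLTAGE choice are made up for this sketch, not taken from any driver touched here), each packed *_SEPARATE_BIT/*_SHARED_BIT helper collapses into a plain BIT() of the enum iio_chan_info_enum value, with the sharing expressed by which of the two new masks the bit is placed in:

	/* assumes <linux/iio/iio.h>, plus <linux/bitops.h> for BIT() */
	static const struct iio_chan_spec example_channels[] = {
		{
			.type = IIO_VOLTAGE,
			.indexed = 1,
			.channel = 0,
			/* was: .info_mask = IIO_CHAN_INFO_RAW_SEPARATE_BIT |
			 *                  IIO_CHAN_INFO_SCALE_SHARED_BIT,
			 */
			.info_mask_separate = BIT(IIO_CHAN_INFO_RAW),
			.info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE),
		},
	};
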
Signed-off-by: Jonathan Cameron Acked-by: Lars-Peter Clausen --- drivers/staging/iio/accel/adis16201_core.c | 8 +++---- drivers/staging/iio/accel/adis16203_core.c | 2 +- drivers/staging/iio/accel/adis16204_core.c | 8 +++---- drivers/staging/iio/accel/adis16209_core.c | 4 ++-- drivers/staging/iio/accel/adis16240_core.c | 9 +++----- drivers/staging/iio/gyro/adis16260_core.c | 4 ++-- include/linux/iio/imu/adis.h | 34 +++++++++++++++--------------- 7 files changed, 32 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/iio/accel/adis16201_core.c b/drivers/staging/iio/accel/adis16201_core.c index 9e5791ff2a04..ab8ec7af88b4 100644 --- a/drivers/staging/iio/accel/adis16201_core.c +++ b/drivers/staging/iio/accel/adis16201_core.c @@ -134,14 +134,14 @@ static const struct iio_chan_spec adis16201_channels[] = { ADIS_SUPPLY_CHAN(ADIS16201_SUPPLY_OUT, ADIS16201_SCAN_SUPPLY, 12), ADIS_TEMP_CHAN(ADIS16201_TEMP_OUT, ADIS16201_SCAN_TEMP, 12), ADIS_ACCEL_CHAN(X, ADIS16201_XACCL_OUT, ADIS16201_SCAN_ACC_X, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), ADIS_ACCEL_CHAN(Y, ADIS16201_YACCL_OUT, ADIS16201_SCAN_ACC_Y, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), ADIS_AUX_ADC_CHAN(ADIS16201_AUX_ADC, ADIS16201_SCAN_AUX_ADC, 12), ADIS_INCLI_CHAN(X, ADIS16201_XINCL_OUT, ADIS16201_SCAN_INCLI_X, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), ADIS_INCLI_CHAN(X, ADIS16201_YINCL_OUT, ADIS16201_SCAN_INCLI_Y, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), IIO_CHAN_SOFT_TIMESTAMP(7) }; diff --git a/drivers/staging/iio/accel/adis16203_core.c b/drivers/staging/iio/accel/adis16203_core.c index 8c235273ff13..b08ac8fdeee2 100644 --- a/drivers/staging/iio/accel/adis16203_core.c +++ b/drivers/staging/iio/accel/adis16203_core.c @@ -102,7 +102,7 @@ static const struct iio_chan_spec adis16203_channels[] = { ADIS_SUPPLY_CHAN(ADIS16203_SUPPLY_OUT, ADIS16203_SCAN_SUPPLY, 12), ADIS_AUX_ADC_CHAN(ADIS16203_AUX_ADC, ADIS16203_SCAN_AUX_ADC, 12), ADIS_INCLI_CHAN(X, ADIS16203_XINCL_OUT, ADIS16203_SCAN_INCLI_X, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), /* Fixme: Not what it appears to be - see data sheet */ ADIS_INCLI_CHAN(Y, ADIS16203_YINCL_OUT, ADIS16203_SCAN_INCLI_Y, 0, 14), ADIS_TEMP_CHAN(ADIS16203_TEMP_OUT, ADIS16203_SCAN_TEMP, 12), diff --git a/drivers/staging/iio/accel/adis16204_core.c b/drivers/staging/iio/accel/adis16204_core.c index f3592668e066..792ec25a50dc 100644 --- a/drivers/staging/iio/accel/adis16204_core.c +++ b/drivers/staging/iio/accel/adis16204_core.c @@ -140,13 +140,11 @@ static const struct iio_chan_spec adis16204_channels[] = { ADIS_AUX_ADC_CHAN(ADIS16204_AUX_ADC, ADIS16204_SCAN_AUX_ADC, 12), ADIS_TEMP_CHAN(ADIS16204_TEMP_OUT, ADIS16204_SCAN_TEMP, 12), ADIS_ACCEL_CHAN(X, ADIS16204_XACCL_OUT, ADIS16204_SCAN_ACC_X, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT | - IIO_CHAN_INFO_PEAK_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS) | BIT(IIO_CHAN_INFO_PEAK), 14), ADIS_ACCEL_CHAN(Y, ADIS16204_YACCL_OUT, ADIS16204_SCAN_ACC_Y, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT | - IIO_CHAN_INFO_PEAK_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS) | BIT(IIO_CHAN_INFO_PEAK), 14), ADIS_ACCEL_CHAN(ROOT_SUM_SQUARED_X_Y, ADIS16204_XY_RSS_OUT, - ADIS16204_SCAN_ACC_XY, IIO_CHAN_INFO_PEAK_SEPARATE_BIT, 14), + ADIS16204_SCAN_ACC_XY, BIT(IIO_CHAN_INFO_PEAK), 14), IIO_CHAN_SOFT_TIMESTAMP(5), }; diff --git 
a/drivers/staging/iio/accel/adis16209_core.c b/drivers/staging/iio/accel/adis16209_core.c index 69c50ee44ce3..323c169d699c 100644 --- a/drivers/staging/iio/accel/adis16209_core.c +++ b/drivers/staging/iio/accel/adis16209_core.c @@ -133,9 +133,9 @@ static const struct iio_chan_spec adis16209_channels[] = { ADIS_SUPPLY_CHAN(ADIS16209_SUPPLY_OUT, ADIS16209_SCAN_SUPPLY, 14), ADIS_TEMP_CHAN(ADIS16209_TEMP_OUT, ADIS16209_SCAN_TEMP, 12), ADIS_ACCEL_CHAN(X, ADIS16209_XACCL_OUT, ADIS16209_SCAN_ACC_X, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), ADIS_ACCEL_CHAN(Y, ADIS16209_YACCL_OUT, ADIS16209_SCAN_ACC_Y, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 14), ADIS_AUX_ADC_CHAN(ADIS16209_AUX_ADC, ADIS16209_SCAN_AUX_ADC, 12), ADIS_INCLI_CHAN(X, ADIS16209_XINCL_OUT, ADIS16209_SCAN_INCLI_X, 0, 14), ADIS_INCLI_CHAN(Y, ADIS16209_YINCL_OUT, ADIS16209_SCAN_INCLI_Y, 0, 14), diff --git a/drivers/staging/iio/accel/adis16240_core.c b/drivers/staging/iio/accel/adis16240_core.c index e97fa0b0233d..fd1f0fd0fba8 100644 --- a/drivers/staging/iio/accel/adis16240_core.c +++ b/drivers/staging/iio/accel/adis16240_core.c @@ -176,14 +176,11 @@ static const struct iio_chan_spec adis16240_channels[] = { ADIS_SUPPLY_CHAN(ADIS16240_SUPPLY_OUT, ADIS16240_SCAN_SUPPLY, 10), ADIS_AUX_ADC_CHAN(ADIS16240_AUX_ADC, ADIS16240_SCAN_AUX_ADC, 10), ADIS_ACCEL_CHAN(X, ADIS16240_XACCL_OUT, ADIS16240_SCAN_ACC_X, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT | - IIO_CHAN_INFO_PEAK_SEPARATE_BIT, 10), + BIT(IIO_CHAN_INFO_CALIBBIAS) | BIT(IIO_CHAN_INFO_PEAK), 10), ADIS_ACCEL_CHAN(Y, ADIS16240_YACCL_OUT, ADIS16240_SCAN_ACC_Y, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT | - IIO_CHAN_INFO_PEAK_SEPARATE_BIT, 10), + BIT(IIO_CHAN_INFO_CALIBBIAS) | BIT(IIO_CHAN_INFO_PEAK), 10), ADIS_ACCEL_CHAN(Z, ADIS16240_ZACCL_OUT, ADIS16240_SCAN_ACC_Z, - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT | - IIO_CHAN_INFO_PEAK_SEPARATE_BIT, 10), + BIT(IIO_CHAN_INFO_CALIBBIAS) | BIT(IIO_CHAN_INFO_PEAK), 10), ADIS_TEMP_CHAN(ADIS16240_TEMP_OUT, ADIS16240_SCAN_TEMP, 10), IIO_CHAN_SOFT_TIMESTAMP(6) }; diff --git a/drivers/staging/iio/gyro/adis16260_core.c b/drivers/staging/iio/gyro/adis16260_core.c index 6e80b8c768ae..620d63fd099b 100644 --- a/drivers/staging/iio/gyro/adis16260_core.c +++ b/drivers/staging/iio/gyro/adis16260_core.c @@ -124,8 +124,8 @@ static IIO_DEVICE_ATTR(sampling_frequency_available, #define ADIS16260_GYRO_CHANNEL_SET(axis, mod) \ struct iio_chan_spec adis16260_channels_##axis[] = { \ ADIS_GYRO_CHAN(mod, ADIS16260_GYRO_OUT, ADIS16260_SCAN_GYRO, \ - IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT | \ - IIO_CHAN_INFO_CALIBSCALE_SEPARATE_BIT, 14), \ + BIT(IIO_CHAN_INFO_CALIBBIAS) | \ + BIT(IIO_CHAN_INFO_CALIBSCALE), 14), \ ADIS_INCLI_CHAN(mod, ADIS16260_ANGL_OUT, ADIS16260_SCAN_ANGL, 0, 14), \ ADIS_TEMP_CHAN(ADIS16260_TEMP_OUT, ADIS16260_SCAN_TEMP, 12), \ ADIS_SUPPLY_CHAN(ADIS16260_SUPPLY_OUT, ADIS16260_SCAN_SUPPLY, 12), \ diff --git a/include/linux/iio/imu/adis.h b/include/linux/iio/imu/adis.h index ff781dca2e9a..b665dc7f017b 100644 --- a/include/linux/iio/imu/adis.h +++ b/include/linux/iio/imu/adis.h @@ -162,8 +162,8 @@ int adis_single_conversion(struct iio_dev *indio_dev, .indexed = 1, \ .channel = (chan), \ .extend_name = name, \ - .info_mask = IIO_CHAN_INFO_RAW_SEPARATE_BIT | \ - IIO_CHAN_INFO_SCALE_SEPARATE_BIT, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE), \ .address = (addr), \ .scan_index = (si), \ .scan_type = { \ @@ -184,9 +184,9 @@ int adis_single_conversion(struct iio_dev 
*indio_dev, .type = IIO_TEMP, \ .indexed = 1, \ .channel = 0, \ - .info_mask = IIO_CHAN_INFO_RAW_SEPARATE_BIT | \ - IIO_CHAN_INFO_SCALE_SEPARATE_BIT | \ - IIO_CHAN_INFO_OFFSET_SEPARATE_BIT, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + BIT(IIO_CHAN_INFO_SCALE) | \ + BIT(IIO_CHAN_INFO_OFFSET), \ .address = (addr), \ .scan_index = (si), \ .scan_type = { \ @@ -197,13 +197,13 @@ int adis_single_conversion(struct iio_dev *indio_dev, }, \ } -#define ADIS_MOD_CHAN(_type, mod, addr, si, info, bits) { \ +#define ADIS_MOD_CHAN(_type, mod, addr, si, info_sep, bits) { \ .type = (_type), \ .modified = 1, \ .channel2 = IIO_MOD_ ## mod, \ - .info_mask = IIO_CHAN_INFO_RAW_SEPARATE_BIT | \ - IIO_CHAN_INFO_SCALE_SHARED_BIT | \ - info, \ + .info_mask_separate = BIT(IIO_CHAN_INFO_RAW) | \ + info_sep, \ + .info_mask_shared_by_type = BIT(IIO_CHAN_INFO_SCALE), \ .address = (addr), \ .scan_index = (si), \ .scan_type = { \ @@ -214,17 +214,17 @@ int adis_single_conversion(struct iio_dev *indio_dev, }, \ } -#define ADIS_ACCEL_CHAN(mod, addr, si, info, bits) \ - ADIS_MOD_CHAN(IIO_ACCEL, mod, addr, si, info, bits) +#define ADIS_ACCEL_CHAN(mod, addr, si, info_sep, bits) \ + ADIS_MOD_CHAN(IIO_ACCEL, mod, addr, si, info_sep, bits) -#define ADIS_GYRO_CHAN(mod, addr, si, info, bits) \ - ADIS_MOD_CHAN(IIO_ANGL_VEL, mod, addr, si, info, bits) +#define ADIS_GYRO_CHAN(mod, addr, si, info_sep, bits) \ + ADIS_MOD_CHAN(IIO_ANGL_VEL, mod, addr, si, info_sep, bits) -#define ADIS_INCLI_CHAN(mod, addr, si, info, bits) \ - ADIS_MOD_CHAN(IIO_INCLI, mod, addr, si, info, bits) +#define ADIS_INCLI_CHAN(mod, addr, si, info_sep, bits) \ + ADIS_MOD_CHAN(IIO_INCLI, mod, addr, si, info_sep, bits) -#define ADIS_ROT_CHAN(mod, addr, si, info, bits) \ - ADIS_MOD_CHAN(IIO_ROT, mod, addr, si, info, bits) +#define ADIS_ROT_CHAN(mod, addr, si, info_sep, bits) \ + ADIS_MOD_CHAN(IIO_ROT, mod, addr, si, info_sep, bits) #ifdef CONFIG_IIO_ADIS_LIB_BUFFER -- cgit From b9606e2aa97d3d831d1236c0e789a33a2f867a8a Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Wed, 27 Feb 2013 19:43:52 +0000 Subject: iio:core drop info_mask from struct iio_info This has been replaced by the pair of masks info_mask_separate and info_mask_shared_by_type. Other variants may follow. 
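What is left behind after this removal is the pair of per-mask loops added earlier in this series. In outline (a simplified sketch only, with error handling dropped and the hypothetical add_attr() helper standing in for __iio_add_chan_devattr()), sysfs attribute creation no longer decodes shared/separate out of interleaved bit pairs via i/2 and !(i%2); each mask is walked on its own, and the mask a bit came from carries the sharing information:

	int i;

	/* one attribute per set bit; the source mask decides the
	 * shared-by-type flag passed to the attribute helper
	 */
	for_each_set_bit(i, &chan->info_mask_separate, sizeof(long)*8)
		add_attr(chan, i, 0);	/* separate */
	for_each_set_bit(i, &chan->info_mask_shared_by_type, sizeof(long)*8)
		add_attr(chan, i, 1);	/* shared by type */
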
Signed-off-by: Jonathan Cameron Acked-by: Lars-Peter Clausen --- drivers/iio/industrialio-core.c | 17 ---------- include/linux/iio/iio.h | 73 +---------------------------------------- 2 files changed, 1 insertion(+), 89 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index f05289f7b512..e145931ef1b8 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -691,23 +691,6 @@ static int iio_device_add_channel_sysfs(struct iio_dev *indio_dev, if (chan->channel < 0) return 0; - for_each_set_bit(i, &chan->info_mask, sizeof(long)*8) { - ret = __iio_add_chan_devattr(iio_chan_info_postfix[i/2], - chan, - &iio_read_channel_info, - &iio_write_channel_info, - i/2, - !(i%2), - &indio_dev->dev, - &indio_dev->channel_attr_list); - if (ret == -EBUSY && (i%2 == 0)) { - ret = 0; - continue; - } - if (ret < 0) - goto error_ret; - attrcount++; - } for_each_set_bit(i, &chan->info_mask_separate, sizeof(long)*8) { ret = __iio_add_chan_devattr(iio_chan_info_postfix[i], chan, diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h index 76976509d628..8d171f427632 100644 --- a/include/linux/iio/iio.h +++ b/include/linux/iio/iio.h @@ -38,76 +38,6 @@ enum iio_chan_info_enum { IIO_CHAN_INFO_HYSTERESIS, }; -#define IIO_CHAN_INFO_SHARED_BIT(type) BIT(type*2) -#define IIO_CHAN_INFO_SEPARATE_BIT(type) BIT(type*2 + 1) -#define IIO_CHAN_INFO_BITS(type) (IIO_CHAN_INFO_SHARED_BIT(type) | \ - IIO_CHAN_INFO_SEPARATE_BIT(type)) - -#define IIO_CHAN_INFO_RAW_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_RAW) -#define IIO_CHAN_INFO_PROCESSED_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_PROCESSED) -#define IIO_CHAN_INFO_SCALE_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_SCALE) -#define IIO_CHAN_INFO_SCALE_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_SCALE) -#define IIO_CHAN_INFO_OFFSET_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_OFFSET) -#define IIO_CHAN_INFO_OFFSET_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_OFFSET) -#define IIO_CHAN_INFO_CALIBSCALE_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_CALIBSCALE) -#define IIO_CHAN_INFO_CALIBSCALE_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_CALIBSCALE) -#define IIO_CHAN_INFO_CALIBBIAS_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_CALIBBIAS) -#define IIO_CHAN_INFO_CALIBBIAS_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_CALIBBIAS) -#define IIO_CHAN_INFO_PEAK_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_PEAK) -#define IIO_CHAN_INFO_PEAK_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_PEAK) -#define IIO_CHAN_INFO_PEAKSCALE_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_PEAKSCALE) -#define IIO_CHAN_INFO_PEAKSCALE_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_PEAKSCALE) -#define IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT( \ - IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW) -#define IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT( \ - IIO_CHAN_INFO_QUADRATURE_CORRECTION_RAW) -#define IIO_CHAN_INFO_AVERAGE_RAW_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_AVERAGE_RAW) -#define IIO_CHAN_INFO_AVERAGE_RAW_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_AVERAGE_RAW) -#define IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT( \ - IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY) -#define 
IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT( \ - IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY) -#define IIO_CHAN_INFO_SAMP_FREQ_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_SAMP_FREQ) -#define IIO_CHAN_INFO_SAMP_FREQ_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_SAMP_FREQ) -#define IIO_CHAN_INFO_FREQUENCY_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_FREQUENCY) -#define IIO_CHAN_INFO_FREQUENCY_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_FREQUENCY) -#define IIO_CHAN_INFO_PHASE_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_PHASE) -#define IIO_CHAN_INFO_PHASE_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_PHASE) -#define IIO_CHAN_INFO_HARDWAREGAIN_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_HARDWAREGAIN) -#define IIO_CHAN_INFO_HARDWAREGAIN_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_HARDWAREGAIN) -#define IIO_CHAN_INFO_HYSTERESIS_SEPARATE_BIT \ - IIO_CHAN_INFO_SEPARATE_BIT(IIO_CHAN_INFO_HYSTERESIS) -#define IIO_CHAN_INFO_HYSTERESIS_SHARED_BIT \ - IIO_CHAN_INFO_SHARED_BIT(IIO_CHAN_INFO_HYSTERESIS) - enum iio_endian { IIO_CPU, IIO_BE, @@ -281,8 +211,7 @@ struct iio_chan_spec { static inline bool iio_channel_has_info(const struct iio_chan_spec *chan, enum iio_chan_info_enum type) { - return (chan->info_mask & IIO_CHAN_INFO_BITS(type)) | - (chan->info_mask_separate & type) | + return (chan->info_mask_separate & type) | (chan->info_mask_shared_by_type & type); } -- cgit From 717bfb5f46f0ee809f6ce04ebdf44521730fff05 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sun, 17 Mar 2013 20:07:25 +0800 Subject: ALSA: snd-usb: handle raw data format of UAC2 devices UAC2 compliant audio devices may announce the capability to transport raw audio data on their endpoints. Catch this and handle it as 'special' stream on the ALSA side. Signed-off-by: Daniel Mack Reported-by: Andreas Koch Signed-off-by: Takashi Iwai --- include/linux/usb/audio-v2.h | 2 ++ sound/usb/format.c | 11 +++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/audio-v2.h b/include/linux/usb/audio-v2.h index ed13053153f4..c5f2158ab00e 100644 --- a/include/linux/usb/audio-v2.h +++ b/include/linux/usb/audio-v2.h @@ -170,6 +170,8 @@ struct uac2_as_header_descriptor { __u8 iChannelNames; } __attribute__((packed)); +#define UAC2_FORMAT_TYPE_I_RAW_DATA (1 << 31) + /* 4.10.1.2 Class-Specific AS Isochronous Audio Data Endpoint Descriptor */ struct uac2_iso_endpoint_descriptor { diff --git a/sound/usb/format.c b/sound/usb/format.c index b30d6fb89b40..a695cafc0599 100644 --- a/sound/usb/format.c +++ b/sound/usb/format.c @@ -47,7 +47,7 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip, int protocol) { int sample_width, sample_bytes; - u64 pcm_formats; + u64 pcm_formats = 0; switch (protocol) { case UAC_VERSION_1: @@ -63,14 +63,17 @@ static u64 parse_audio_format_i_type(struct snd_usb_audio *chip, struct uac_format_type_i_ext_descriptor *fmt = _fmt; sample_width = fmt->bBitResolution; sample_bytes = fmt->bSubslotSize; + + if (format & UAC2_FORMAT_TYPE_I_RAW_DATA) + pcm_formats |= SNDRV_PCM_FMTBIT_SPECIAL; + format <<= 1; break; } } - pcm_formats = 0; - - if (format == 0 || format == (1 << UAC_FORMAT_TYPE_I_UNDEFINED)) { + if ((pcm_formats == 0) && + (format == 0 || format == (1 << UAC_FORMAT_TYPE_I_UNDEFINED))) { /* some devices don't define this correctly... 
*/ snd_printdd(KERN_INFO "%d:%u:%d : format type 0 is detected, processed as PCM\n", chip->dev->devnum, fp->iface, fp->altsetting); -- cgit From 1f0972f5b05a674d73e4eb314fa1b6c78e37aef1 Mon Sep 17 00:00:00 2001 From: Roger Quadros Date: Tue, 12 Mar 2013 13:24:19 +0200 Subject: usb: phy: nop: Add some parameters to platform data Add clk_rate parameter to platform data. If supplied, the NOP phy driver will program the clock to that rate during probe. Also add 2 flags, needs_vcc and needs_reset. If the flag is set and the regulator couldn't be found then the driver will bail out with -EPROBE_DEFER. Signed-off-by: Roger Quadros Acked-by: Felipe Balbi Signed-off-by: Felipe Balbi --- include/linux/usb/nop-usb-xceiv.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/nop-usb-xceiv.h b/include/linux/usb/nop-usb-xceiv.h index 28884c717411..148d35171aac 100644 --- a/include/linux/usb/nop-usb-xceiv.h +++ b/include/linux/usb/nop-usb-xceiv.h @@ -5,6 +5,11 @@ struct nop_usb_xceiv_platform_data { enum usb_phy_type type; + unsigned long clk_rate; + + /* if set fails with -EPROBE_DEFER if can't get regulator */ + unsigned int needs_vcc:1; + unsigned int needs_reset:1; }; #if defined(CONFIG_NOP_USB_XCEIV) || (defined(CONFIG_NOP_USB_XCEIV_MODULE) && defined(MODULE)) -- cgit From 4495afcf713adb5bdb16504052952bdd0d11f90a Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Tue, 26 Feb 2013 20:03:28 +0530 Subject: usb: dwc3: omap: remove platform data associated with dwc3-omap omap5 is not going to have support for non-dt boot making the platform data associated with dwc3 useless. Removed it here. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-omap.c | 24 ++++++++++-------------- include/linux/platform_data/dwc3-omap.h | 4 ---- 2 files changed, 10 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index e1206b419932..43a248219aae 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -309,7 +309,6 @@ static int dwc3_omap_remove_core(struct device *dev, void *c) static int dwc3_omap_probe(struct platform_device *pdev) { - struct dwc3_omap_data *pdata = pdev->dev.platform_data; struct device_node *node = pdev->dev.of_node; struct dwc3_omap *omap; @@ -326,6 +325,11 @@ static int dwc3_omap_probe(struct platform_device *pdev) void __iomem *base; void *context; + if (!node) { + dev_err(dev, "device node not found\n"); + return -EINVAL; + } + omap = devm_kzalloc(dev, sizeof(*omap), GFP_KERNEL); if (!omap) { dev_err(dev, "not enough memory\n"); @@ -387,12 +391,7 @@ static int dwc3_omap_probe(struct platform_device *pdev) reg = dwc3_omap_readl(omap->base, USBOTGSS_UTMI_OTG_STATUS); - if (node) - of_property_read_u32(node, "utmi-mode", &utmi_mode); - else if (pdata) - utmi_mode = pdata->utmi_mode; - else - dev_dbg(dev, "missing platform data\n"); + of_property_read_u32(node, "utmi-mode", &utmi_mode); switch (utmi_mode) { case DWC3_OMAP_UTMI_MODE_SW: @@ -435,13 +434,10 @@ static int dwc3_omap_probe(struct platform_device *pdev) dwc3_omap_writel(omap->base, USBOTGSS_IRQENABLE_SET_1, reg); - if (node) { - ret = of_platform_populate(node, NULL, NULL, dev); - if (ret) { - dev_err(&pdev->dev, - "failed to add create dwc3 core\n"); - return ret; - } + ret = of_platform_populate(node, NULL, NULL, dev); + if (ret) { + dev_err(&pdev->dev, "failed to create dwc3 core\n"); + return ret; } return 0; diff --git 
a/include/linux/platform_data/dwc3-omap.h b/include/linux/platform_data/dwc3-omap.h index ada401244e0b..1d36ca874cc8 100644 --- a/include/linux/platform_data/dwc3-omap.h +++ b/include/linux/platform_data/dwc3-omap.h @@ -41,7 +41,3 @@ enum dwc3_omap_utmi_mode { DWC3_OMAP_UTMI_MODE_HW, DWC3_OMAP_UTMI_MODE_SW, }; - -struct dwc3_omap_data { - enum dwc3_omap_utmi_mode utmi_mode; -}; -- cgit From f07bd56bbdaa2340ebf46af9a37e7b2d1b4578e3 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 24 Jan 2013 14:52:24 +0200 Subject: usb: gadget: udc-core: allow udc class register gadget device Currently all UDC drivers are calling device_register() before calling usb_add_gadget_udc(). In order to avoid code duplication, we can allow udc-core.c register that device. However that would become a really large patch, so to cope with the meanwhile and allow us to write bite-sized patches, we're adding a flag which will be set by UDC driver once it removes the code for registering the gadget device. Once all are converted, the new flag will be removed. Reviewed-by: Tomasz Figa Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc-core.c | 23 +++++++++++++++++++---- include/linux/usb/gadget.h | 4 ++++ 2 files changed, 23 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc-core.c b/drivers/usb/gadget/udc-core.c index 2a9cd369f71c..919505426ec1 100644 --- a/drivers/usb/gadget/udc-core.c +++ b/drivers/usb/gadget/udc-core.c @@ -173,6 +173,14 @@ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) if (!udc) goto err1; + if (gadget->register_my_device) { + dev_set_name(&gadget->dev, "gadget"); + + ret = device_register(&gadget->dev); + if (ret) + goto err2; + } + device_initialize(&udc->dev); udc->dev.release = usb_udc_release; udc->dev.class = udc_class; @@ -180,7 +188,7 @@ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) udc->dev.parent = parent; ret = dev_set_name(&udc->dev, "%s", kobject_name(&parent->kobj)); if (ret) - goto err2; + goto err3; udc->gadget = gadget; @@ -189,18 +197,22 @@ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) ret = device_add(&udc->dev); if (ret) - goto err3; + goto err4; mutex_unlock(&udc_lock); return 0; -err3: + +err4: list_del(&udc->list); mutex_unlock(&udc_lock); -err2: +err3: put_device(&udc->dev); +err2: + if (gadget->register_my_device) + put_device(&gadget->dev); err1: return ret; } @@ -254,6 +266,9 @@ found: kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); device_unregister(&udc->dev); + + if (gadget->register_my_device) + device_unregister(&gadget->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 2e297e80d59a..fcd9ef8d3f70 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -494,6 +494,9 @@ struct usb_gadget_ops { * only supports HNP on a different root port. * @b_hnp_enable: OTG device feature flag, indicating that the A-Host * enabled HNP support. + * @register_my_device: Flag telling udc-core that UDC driver didn't + * register the gadget device to the driver model. Temporary until + * all UDC drivers are fixed up properly. * @name: Identifies the controller hardware type. Used in diagnostics * and sometimes configuration. * @dev: Driver model state for this abstract device. 
@@ -531,6 +534,7 @@ struct usb_gadget { unsigned b_hnp_enable:1; unsigned a_hnp_support:1; unsigned a_alt_hnp_support:1; + unsigned register_my_device:1; const char *name; struct device dev; unsigned out_epnum; -- cgit From 7bce401cc6db5508ef2517e45bd8caf7ce0a15ee Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 24 Jan 2013 17:41:00 +0200 Subject: usb: gadget: drop now unnecessary flag We don't need the ->register_my_device flag anymore because all UDC drivers have been properly converted. Let's remove every history of it. Signed-off-by: Felipe Balbi --- drivers/usb/chipidea/udc.c | 1 - drivers/usb/dwc3/gadget.c | 1 - drivers/usb/gadget/amd5536udc.c | 1 - drivers/usb/gadget/at91_udc.c | 1 - drivers/usb/gadget/atmel_usba_udc.c | 1 - drivers/usb/gadget/bcm63xx_udc.c | 1 - drivers/usb/gadget/dummy_hcd.c | 1 - drivers/usb/gadget/fsl_qe_udc.c | 1 - drivers/usb/gadget/fsl_udc_core.c | 1 - drivers/usb/gadget/fusb300_udc.c | 1 - drivers/usb/gadget/goku_udc.c | 1 - drivers/usb/gadget/imx_udc.c | 1 - drivers/usb/gadget/lpc32xx_udc.c | 1 - drivers/usb/gadget/m66592-udc.c | 1 - drivers/usb/gadget/mv_u3d_core.c | 1 - drivers/usb/gadget/mv_udc_core.c | 1 - drivers/usb/gadget/net2272.c | 1 - drivers/usb/gadget/net2280.c | 1 - drivers/usb/gadget/omap_udc.c | 1 - drivers/usb/gadget/pch_udc.c | 1 - drivers/usb/gadget/pxa25x_udc.c | 1 - drivers/usb/gadget/pxa27x_udc.c | 1 - drivers/usb/gadget/r8a66597-udc.c | 1 - drivers/usb/gadget/s3c-hsotg.c | 1 - drivers/usb/gadget/s3c-hsudc.c | 1 - drivers/usb/gadget/s3c2410_udc.c | 1 - drivers/usb/gadget/udc-core.c | 18 +++++++----------- drivers/usb/musb/musb_gadget.c | 1 - drivers/usb/renesas_usbhs/mod_gadget.c | 1 - include/linux/usb/gadget.h | 4 ---- 30 files changed, 7 insertions(+), 43 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index e95e8bbde988..1b65ac8f3c9b 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -1721,7 +1721,6 @@ static int udc_start(struct ci13xxx *ci) ci->gadget.dev.coherent_dma_mask = dev->coherent_dma_mask; ci->gadget.dev.parent = dev; ci->gadget.dev.release = udc_release; - ci->gadget.register_my_device = true; /* alloc resources */ ci->qh_pool = dma_pool_create("ci13xxx_qh", dev, diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 10bb161eec88..65493b6cd5a6 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -2499,7 +2499,6 @@ int dwc3_gadget_init(struct dwc3 *dwc) dwc->gadget.dev.dma_parms = dwc->dev->dma_parms; dwc->gadget.dev.dma_mask = dwc->dev->dma_mask; dwc->gadget.dev.release = dwc3_gadget_release; - dwc->gadget.register_my_device = true; dwc->gadget.name = "dwc3-gadget"; /* diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index eee01ea70f8c..eec4461fb45f 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -3275,7 +3275,6 @@ static int udc_probe(struct udc *dev) dev->gadget.dev.release = gadget_release; dev->gadget.name = name; dev->gadget.max_speed = USB_SPEED_HIGH; - dev->gadget.register_my_device = true; /* init registers, interrupts, ... 
*/ startup_registers(dev); diff --git a/drivers/usb/gadget/at91_udc.c b/drivers/usb/gadget/at91_udc.c index 47b7e58f8415..9936de9bbe50 100644 --- a/drivers/usb/gadget/at91_udc.c +++ b/drivers/usb/gadget/at91_udc.c @@ -1726,7 +1726,6 @@ static int at91udc_probe(struct platform_device *pdev) /* init software state */ udc = &controller; - udc->gadget.register_my_device = true; udc->gadget.dev.parent = dev; if (pdev->dev.of_node) at91udc_of_init(udc, pdev->dev.of_node); diff --git a/drivers/usb/gadget/atmel_usba_udc.c b/drivers/usb/gadget/atmel_usba_udc.c index 2404d0c25668..41518e612808 100644 --- a/drivers/usb/gadget/atmel_usba_udc.c +++ b/drivers/usb/gadget/atmel_usba_udc.c @@ -1902,7 +1902,6 @@ static int __init usba_udc_probe(struct platform_device *pdev) udc->gadget.dev.parent = &pdev->dev; udc->gadget.dev.dma_mask = pdev->dev.dma_mask; - udc->gadget.register_my_device = true; platform_set_drvdata(pdev, udc); diff --git a/drivers/usb/gadget/bcm63xx_udc.c b/drivers/usb/gadget/bcm63xx_udc.c index c020b877219d..d4f73e1b37e6 100644 --- a/drivers/usb/gadget/bcm63xx_udc.c +++ b/drivers/usb/gadget/bcm63xx_udc.c @@ -2374,7 +2374,6 @@ static int bcm63xx_udc_probe(struct platform_device *pdev) udc->gadget.dev.parent = dev; udc->gadget.dev.release = bcm63xx_udc_gadget_release; udc->gadget.dev.dma_mask = dev->dma_mask; - udc->gadget.register_my_device = true; if (!pd->use_fullspeed && !use_fullspeed) udc->gadget.max_speed = USB_SPEED_HIGH; diff --git a/drivers/usb/gadget/dummy_hcd.c b/drivers/usb/gadget/dummy_hcd.c index a6950aa8f3be..c4f27d5a2b9c 100644 --- a/drivers/usb/gadget/dummy_hcd.c +++ b/drivers/usb/gadget/dummy_hcd.c @@ -983,7 +983,6 @@ static int dummy_udc_probe(struct platform_device *pdev) dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; dum->gadget.max_speed = USB_SPEED_SUPER; - dum->gadget.register_my_device = true; dum->gadget.dev.parent = &pdev->dev; dum->gadget.dev.release = dummy_gadget_release; diff --git a/drivers/usb/gadget/fsl_qe_udc.c b/drivers/usb/gadget/fsl_qe_udc.c index 0f78cd859d68..0e7531bd33f4 100644 --- a/drivers/usb/gadget/fsl_qe_udc.c +++ b/drivers/usb/gadget/fsl_qe_udc.c @@ -2525,7 +2525,6 @@ static int qe_udc_probe(struct platform_device *ofdev) udc->gadget.name = driver_name; udc->gadget.dev.release = qe_udc_release; udc->gadget.dev.parent = &ofdev->dev; - udc->gadget.register_my_device = true; /* initialize qe_ep struct */ for (i = 0; i < USB_MAX_ENDPOINTS ; i++) { diff --git a/drivers/usb/gadget/fsl_udc_core.c b/drivers/usb/gadget/fsl_udc_core.c index 9140a2daad87..f33b9005eeac 100644 --- a/drivers/usb/gadget/fsl_udc_core.c +++ b/drivers/usb/gadget/fsl_udc_core.c @@ -2524,7 +2524,6 @@ static int __init fsl_udc_probe(struct platform_device *pdev) udc_controller->gadget.dev.release = fsl_udc_release; udc_controller->gadget.dev.parent = &pdev->dev; udc_controller->gadget.dev.of_node = pdev->dev.of_node; - udc_controller->gadget.register_my_device = true; if (!IS_ERR_OR_NULL(udc_controller->transceiver)) udc_controller->gadget.is_otg = 1; diff --git a/drivers/usb/gadget/fusb300_udc.c b/drivers/usb/gadget/fusb300_udc.c index d29017218b01..2d3c8b351f42 100644 --- a/drivers/usb/gadget/fusb300_udc.c +++ b/drivers/usb/gadget/fusb300_udc.c @@ -1427,7 +1427,6 @@ static int __init fusb300_probe(struct platform_device *pdev) fusb300->gadget.dev.dma_mask = pdev->dev.dma_mask; fusb300->gadget.dev.release = pdev->dev.release; fusb300->gadget.name = udc_name; - fusb300->gadget.register_my_device = true; fusb300->reg = reg; ret = request_irq(ires->start, 
fusb300_irq, IRQF_SHARED, diff --git a/drivers/usb/gadget/goku_udc.c b/drivers/usb/gadget/goku_udc.c index b4ea2cf465a6..8a6c66618bd3 100644 --- a/drivers/usb/gadget/goku_udc.c +++ b/drivers/usb/gadget/goku_udc.c @@ -1758,7 +1758,6 @@ static int goku_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev->gadget.dev.dma_mask = pdev->dev.dma_mask; dev->gadget.dev.release = gadget_release; dev->gadget.name = driver_name; - dev->gadget.register_my_device = true; /* now all the pci goodies ... */ retval = pci_enable_device(pdev); diff --git a/drivers/usb/gadget/imx_udc.c b/drivers/usb/gadget/imx_udc.c index 435b20346ead..9c5b7451a7d1 100644 --- a/drivers/usb/gadget/imx_udc.c +++ b/drivers/usb/gadget/imx_udc.c @@ -1461,7 +1461,6 @@ static int __init imx_udc_probe(struct platform_device *pdev) imx_usb->clk = clk; imx_usb->dev = &pdev->dev; - imx_usb->gadget.register_my_device = true; imx_usb->gadget.dev.parent = &pdev->dev; imx_usb->gadget.dev.dma_mask = pdev->dev.dma_mask; diff --git a/drivers/usb/gadget/lpc32xx_udc.c b/drivers/usb/gadget/lpc32xx_udc.c index 329e1c5f0ef9..67c3ef9d9bed 100644 --- a/drivers/usb/gadget/lpc32xx_udc.c +++ b/drivers/usb/gadget/lpc32xx_udc.c @@ -3090,7 +3090,6 @@ static int __init lpc32xx_udc_probe(struct platform_device *pdev) /* init software state */ udc->gadget.dev.parent = dev; - udc->gadget.register_my_device = true; udc->pdev = pdev; udc->dev = &pdev->dev; udc->enabled = 0; diff --git a/drivers/usb/gadget/m66592-udc.c b/drivers/usb/gadget/m66592-udc.c index 43ad70dff74d..eb61d0b54f21 100644 --- a/drivers/usb/gadget/m66592-udc.c +++ b/drivers/usb/gadget/m66592-udc.c @@ -1612,7 +1612,6 @@ static int __init m66592_probe(struct platform_device *pdev) m66592->gadget.dev.dma_mask = pdev->dev.dma_mask; m66592->gadget.dev.release = pdev->dev.release; m66592->gadget.name = udc_name; - m66592->gadget.register_my_device = true; init_timer(&m66592->timer); m66592->timer.function = m66592_timer; diff --git a/drivers/usb/gadget/mv_u3d_core.c b/drivers/usb/gadget/mv_u3d_core.c index 734ade11505f..e5735fc610de 100644 --- a/drivers/usb/gadget/mv_u3d_core.c +++ b/drivers/usb/gadget/mv_u3d_core.c @@ -1959,7 +1959,6 @@ static int mv_u3d_probe(struct platform_device *dev) u3d->gadget.dev.dma_mask = dev->dev.dma_mask; u3d->gadget.dev.release = mv_u3d_gadget_release; u3d->gadget.name = driver_name; /* gadget name */ - u3d->gadget.register_my_device = true; mv_u3d_eps_init(u3d); diff --git a/drivers/usb/gadget/mv_udc_core.c b/drivers/usb/gadget/mv_udc_core.c index a7afdfb413b3..be35573f8703 100644 --- a/drivers/usb/gadget/mv_udc_core.c +++ b/drivers/usb/gadget/mv_udc_core.c @@ -2313,7 +2313,6 @@ static int mv_udc_probe(struct platform_device *pdev) udc->gadget.dev.dma_mask = pdev->dev.dma_mask; udc->gadget.dev.release = gadget_release; udc->gadget.name = driver_name; /* gadget name */ - udc->gadget.register_my_device = true; eps_init(udc); diff --git a/drivers/usb/gadget/net2272.c b/drivers/usb/gadget/net2272.c index 635248f42dcd..78c8bb538332 100644 --- a/drivers/usb/gadget/net2272.c +++ b/drivers/usb/gadget/net2272.c @@ -2239,7 +2239,6 @@ static struct net2272 *net2272_probe_init(struct device *dev, unsigned int irq) ret->gadget.dev.dma_mask = dev->dma_mask; ret->gadget.dev.release = net2272_gadget_release; ret->gadget.name = driver_name; - ret->gadget.register_my_device = true; return ret; } diff --git a/drivers/usb/gadget/net2280.c b/drivers/usb/gadget/net2280.c index c55af4293509..2089d9b0058c 100644 --- a/drivers/usb/gadget/net2280.c +++ b/drivers/usb/gadget/net2280.c 
@@ -2714,7 +2714,6 @@ static int net2280_probe (struct pci_dev *pdev, const struct pci_device_id *id) dev->gadget.dev.dma_mask = pdev->dev.dma_mask; dev->gadget.dev.release = gadget_release; dev->gadget.name = driver_name; - dev->gadget.register_my_device = true; /* now all the pci goodies ... */ if (pci_enable_device (pdev) < 0) { diff --git a/drivers/usb/gadget/omap_udc.c b/drivers/usb/gadget/omap_udc.c index c979272e7c86..b23c861e2a97 100644 --- a/drivers/usb/gadget/omap_udc.c +++ b/drivers/usb/gadget/omap_udc.c @@ -2634,7 +2634,6 @@ omap_udc_setup(struct platform_device *odev, struct usb_phy *xceiv) udc->gadget.dev.release = omap_udc_release; udc->gadget.dev.parent = &odev->dev; - udc->gadget.register_my_device = true; if (use_dma) udc->gadget.dev.dma_mask = odev->dev.dma_mask; diff --git a/drivers/usb/gadget/pch_udc.c b/drivers/usb/gadget/pch_udc.c index 703214543dd4..e8c9afd8fbf0 100644 --- a/drivers/usb/gadget/pch_udc.c +++ b/drivers/usb/gadget/pch_udc.c @@ -3198,7 +3198,6 @@ static int pch_udc_probe(struct pci_dev *pdev, dev->gadget.dev.release = gadget_release; dev->gadget.name = KBUILD_MODNAME; dev->gadget.max_speed = USB_SPEED_HIGH; - dev->gadget.register_my_device = true; /* Put the device in disconnected state till a driver is bound */ pch_udc_set_disconnect(dev); diff --git a/drivers/usb/gadget/pxa25x_udc.c b/drivers/usb/gadget/pxa25x_udc.c index 8996fcb053ef..e29bb878b2d7 100644 --- a/drivers/usb/gadget/pxa25x_udc.c +++ b/drivers/usb/gadget/pxa25x_udc.c @@ -2140,7 +2140,6 @@ static int __init pxa25x_udc_probe(struct platform_device *pdev) dev->gadget.dev.parent = &pdev->dev; dev->gadget.dev.dma_mask = pdev->dev.dma_mask; - dev->gadget.register_my_device = true; the_controller = dev; platform_set_drvdata(pdev, dev); diff --git a/drivers/usb/gadget/pxa27x_udc.c b/drivers/usb/gadget/pxa27x_udc.c index 1c5bfaafa6c8..07ce1477f911 100644 --- a/drivers/usb/gadget/pxa27x_udc.c +++ b/drivers/usb/gadget/pxa27x_udc.c @@ -2457,7 +2457,6 @@ static int __init pxa_udc_probe(struct platform_device *pdev) udc->gadget.dev.parent = &pdev->dev; udc->gadget.dev.dma_mask = NULL; - udc->gadget.register_my_device = true; udc->vbus_sensed = 0; the_controller = udc; diff --git a/drivers/usb/gadget/r8a66597-udc.c b/drivers/usb/gadget/r8a66597-udc.c index ae94c0eaf633..a67d47708b98 100644 --- a/drivers/usb/gadget/r8a66597-udc.c +++ b/drivers/usb/gadget/r8a66597-udc.c @@ -1919,7 +1919,6 @@ static int __init r8a66597_probe(struct platform_device *pdev) r8a66597->gadget.dev.dma_mask = pdev->dev.dma_mask; r8a66597->gadget.dev.release = pdev->dev.release; r8a66597->gadget.name = udc_name; - r8a66597->gadget.register_my_device = true; init_timer(&r8a66597->timer); r8a66597->timer.function = r8a66597_timer; diff --git a/drivers/usb/gadget/s3c-hsotg.c b/drivers/usb/gadget/s3c-hsotg.c index 5fbd233eb6a0..8ae0bd99ffde 100644 --- a/drivers/usb/gadget/s3c-hsotg.c +++ b/drivers/usb/gadget/s3c-hsotg.c @@ -3573,7 +3573,6 @@ static int s3c_hsotg_probe(struct platform_device *pdev) hsotg->gadget.dev.parent = dev; hsotg->gadget.dev.dma_mask = dev->dma_mask; hsotg->gadget.dev.release = s3c_hsotg_release; - hsotg->gadget.register_my_device = true; /* reset the system */ diff --git a/drivers/usb/gadget/s3c-hsudc.c b/drivers/usb/gadget/s3c-hsudc.c index c4ff747f53fc..7fc3de537c9a 100644 --- a/drivers/usb/gadget/s3c-hsudc.c +++ b/drivers/usb/gadget/s3c-hsudc.c @@ -1312,7 +1312,6 @@ static int s3c_hsudc_probe(struct platform_device *pdev) hsudc->gadget.is_otg = 0; hsudc->gadget.is_a_peripheral = 0; hsudc->gadget.speed = 
USB_SPEED_UNKNOWN; - hsudc->gadget.register_my_device = true; s3c_hsudc_setup_ep(hsudc); diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c index c4134948dd9e..a669081bbb88 100644 --- a/drivers/usb/gadget/s3c2410_udc.c +++ b/drivers/usb/gadget/s3c2410_udc.c @@ -1826,7 +1826,6 @@ static int s3c2410_udc_probe(struct platform_device *pdev) udc->gadget.dev.parent = &pdev->dev; udc->gadget.dev.dma_mask = pdev->dev.dma_mask; - udc->gadget.register_my_device = true; the_controller = udc; platform_set_drvdata(pdev, udc); diff --git a/drivers/usb/gadget/udc-core.c b/drivers/usb/gadget/udc-core.c index 919505426ec1..40b1d888d5a1 100644 --- a/drivers/usb/gadget/udc-core.c +++ b/drivers/usb/gadget/udc-core.c @@ -173,13 +173,11 @@ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) if (!udc) goto err1; - if (gadget->register_my_device) { - dev_set_name(&gadget->dev, "gadget"); + dev_set_name(&gadget->dev, "gadget"); - ret = device_register(&gadget->dev); - if (ret) - goto err2; - } + ret = device_register(&gadget->dev); + if (ret) + goto err2; device_initialize(&udc->dev); udc->dev.release = usb_udc_release; @@ -211,8 +209,8 @@ err3: put_device(&udc->dev); err2: - if (gadget->register_my_device) - put_device(&gadget->dev); + put_device(&gadget->dev); + err1: return ret; } @@ -266,9 +264,7 @@ found: kobject_uevent(&udc->dev.kobj, KOBJ_REMOVE); device_unregister(&udc->dev); - - if (gadget->register_my_device) - device_unregister(&gadget->dev); + device_unregister(&gadget->dev); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c index cadb750921e9..e363033f6754 100644 --- a/drivers/usb/musb/musb_gadget.c +++ b/drivers/usb/musb/musb_gadget.c @@ -1891,7 +1891,6 @@ int musb_gadget_setup(struct musb *musb) musb->g.dev.dma_mask = musb->controller->dma_mask; musb->g.dev.release = musb_gadget_release; musb->g.name = musb_driver_name; - musb->g.register_my_device = true; musb->g.is_otg = 1; musb_g_init_endpoints(musb); diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index 5d5fab0ad0d1..6a3afa9b764c 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -981,7 +981,6 @@ int usbhs_mod_gadget_probe(struct usbhs_priv *priv) gpriv->gadget.name = "renesas_usbhs_udc"; gpriv->gadget.ops = &usbhsg_gadget_ops; gpriv->gadget.max_speed = USB_SPEED_HIGH; - gpriv->gadget.register_my_device = true; INIT_LIST_HEAD(&gpriv->gadget.ep_list); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index fcd9ef8d3f70..2e297e80d59a 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -494,9 +494,6 @@ struct usb_gadget_ops { * only supports HNP on a different root port. * @b_hnp_enable: OTG device feature flag, indicating that the A-Host * enabled HNP support. - * @register_my_device: Flag telling udc-core that UDC driver didn't - * register the gadget device to the driver model. Temporary until - * all UDC drivers are fixed up properly. * @name: Identifies the controller hardware type. Used in diagnostics * and sometimes configuration. * @dev: Driver model state for this abstract device. 
@@ -534,7 +531,6 @@ struct usb_gadget { unsigned b_hnp_enable:1; unsigned a_hnp_support:1; unsigned a_alt_hnp_support:1; - unsigned register_my_device:1; const char *name; struct device dev; unsigned out_epnum; -- cgit From d1e3d757f7aa91f15db347fc05ffd7ef7f413091 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 24 Jan 2013 22:29:48 +0200 Subject: usb: common: introduce usb_state_string() this function will receive enum usb_device_state and return a human-readable string from it or, in case an unknown value is passed as argument, the string "UNKNOWN". Signed-off-by: Felipe Balbi --- drivers/usb/usb-common.c | 21 +++++++++++++++++++++ include/linux/usb/ch9.h | 9 +++++++++ 2 files changed, 30 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/usb-common.c b/drivers/usb/usb-common.c index d29503e954ab..070b681e5d17 100644 --- a/drivers/usb/usb-common.c +++ b/drivers/usb/usb-common.c @@ -32,4 +32,25 @@ const char *usb_speed_string(enum usb_device_speed speed) } EXPORT_SYMBOL_GPL(usb_speed_string); +const char *usb_state_string(enum usb_device_state state) +{ + static const char *const names[] = { + [USB_STATE_NOTATTACHED] = "not attached", + [USB_STATE_ATTACHED] = "attached", + [USB_STATE_POWERED] = "powered", + [USB_STATE_RECONNECTING] = "reconnecting", + [USB_STATE_UNAUTHENTICATED] = "unauthenticated", + [USB_STATE_DEFAULT] = "default", + [USB_STATE_ADDRESS] = "addresssed", + [USB_STATE_CONFIGURED] = "configured", + [USB_STATE_SUSPENDED] = "suspended", + }; + + if (state < 0 || state >= ARRAY_SIZE(names)) + return "UNKNOWN"; + + return names[state]; +} +EXPORT_SYMBOL_GPL(usb_state_string); + MODULE_LICENSE("GPL"); diff --git a/include/linux/usb/ch9.h b/include/linux/usb/ch9.h index 9c210f2283df..27603bcbb9b9 100644 --- a/include/linux/usb/ch9.h +++ b/include/linux/usb/ch9.h @@ -43,4 +43,13 @@ */ extern const char *usb_speed_string(enum usb_device_speed speed); + +/** + * usb_state_string - Returns human readable name for the state. + * @state: The state to return a human-readable name for. If it's not + * any of the states devices in usb_device_state_string enum, + * the string UNKNOWN will be returned. + */ +extern const char *usb_state_string(enum usb_device_state state); + #endif /* __LINUX_USB_CH9_H */ -- cgit From 49401f4169c0e5a1b38f1a676d6f12eecaf77485 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Mon, 19 Dec 2011 12:57:04 +0200 Subject: usb: gadget: introduce gadget state tracking that's useful information to expose to userland.
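As an illustration only (not part of this patch), a UDC driver would call the new usb_gadget_set_state() helper whenever its hardware reports a chapter 9 transition, which keeps the sysfs attribute added below current; every example_/EXAMPLE_ name in this sketch is invented:

/*
 * Hypothetical interrupt handler updating the new gadget->state field.
 * Only usb_gadget_set_state() and the USB_STATE_* values are real; the
 * driver structure, status accessor and bit names are assumptions.
 */
static irqreturn_t example_udc_irq(int irq, void *data)
{
	struct example_udc *udc = data;
	u32 status = example_udc_read_status(udc);

	if (status & EXAMPLE_UDC_IRQ_RESET)
		usb_gadget_set_state(&udc->gadget, USB_STATE_DEFAULT);
	else if (status & EXAMPLE_UDC_IRQ_ADDRESSED)
		usb_gadget_set_state(&udc->gadget, USB_STATE_ADDRESS);
	else if (status & EXAMPLE_UDC_IRQ_CONFIGURED)
		usb_gadget_set_state(&udc->gadget, USB_STATE_CONFIGURED);

	return IRQ_HANDLED;
}

Userland can then read the "state" attribute of the udc class device and gets back the string produced by usb_state_string().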
Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc-core.c | 23 +++++++++++++++++++++++ include/linux/usb/gadget.h | 9 +++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc-core.c b/drivers/usb/gadget/udc-core.c index 40b1d888d5a1..8a1eeb24ae6a 100644 --- a/drivers/usb/gadget/udc-core.c +++ b/drivers/usb/gadget/udc-core.c @@ -101,6 +101,16 @@ EXPORT_SYMBOL_GPL(usb_gadget_unmap_request); /* ------------------------------------------------------------------------- */ +void usb_gadget_set_state(struct usb_gadget *gadget, + enum usb_device_state state) +{ + gadget->state = state; + sysfs_notify(&gadget->dev.kobj, NULL, "status"); +} +EXPORT_SYMBOL_GPL(usb_gadget_set_state); + +/* ------------------------------------------------------------------------- */ + /** * usb_gadget_udc_start - tells usb device controller to start up * @gadget: The gadget we want to get started @@ -197,6 +207,8 @@ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) if (ret) goto err4; + usb_gadget_set_state(gadget, USB_STATE_NOTATTACHED); + mutex_unlock(&udc_lock); return 0; @@ -406,6 +418,16 @@ static ssize_t usb_udc_softconn_store(struct device *dev, } static DEVICE_ATTR(soft_connect, S_IWUSR, NULL, usb_udc_softconn_store); +static ssize_t usb_gadget_state_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct usb_udc *udc = container_of(dev, struct usb_udc, dev); + struct usb_gadget *gadget = udc->gadget; + + return sprintf(buf, "%s\n", usb_state_string(gadget->state)); +} +static DEVICE_ATTR(state, S_IRUGO, usb_gadget_state_show, NULL); + #define USB_UDC_SPEED_ATTR(name, param) \ ssize_t usb_udc_##param##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -439,6 +461,7 @@ static USB_UDC_ATTR(a_alt_hnp_support); static struct attribute *usb_udc_attrs[] = { &dev_attr_srp.attr, &dev_attr_soft_connect.attr, + &dev_attr_state.attr, &dev_attr_current_speed.attr, &dev_attr_maximum_speed.attr, diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 2e297e80d59a..32b734d88d6b 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -482,6 +482,7 @@ struct usb_gadget_ops { * @speed: Speed of current connection to USB host. * @max_speed: Maximal speed the UDC can handle. UDC must support this * and all slower speeds. + * @state: the state we are now (attached, suspended, configured, etc) * @sg_supported: true if we can handle scatter-gather * @is_otg: True if the USB device port uses a Mini-AB jack, so that the * gadget driver must provide a USB OTG descriptor. 
@@ -525,6 +526,7 @@ struct usb_gadget { struct list_head ep_list; /* of usb_ep */ enum usb_device_speed speed; enum usb_device_speed max_speed; + enum usb_device_state state; unsigned sg_supported:1; unsigned is_otg:1; unsigned is_a_peripheral:1; @@ -959,6 +961,13 @@ extern void usb_gadget_unmap_request(struct usb_gadget *gadget, /*-------------------------------------------------------------------------*/ +/* utility to set gadget state properly */ + +extern void usb_gadget_set_state(struct usb_gadget *gadget, + enum usb_device_state state); + +/*-------------------------------------------------------------------------*/ + /* utility wrapping a simple endpoint selection policy */ extern struct usb_ep *usb_ep_autoconfig(struct usb_gadget *, -- cgit From 792bfcf7a1cd7913fa5d55f2b3a40e3275e98f6f Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Tue, 26 Feb 2013 14:47:44 +0200 Subject: usb: gadget: udc-core: introduce usb_add_gadget_udc_release() not all UDC drivers need a proper release function; for those which don't need it, udc-core will provide a no-op release method so we can remove the "redefinition" of such methods in almost every UDC driver. Signed-off-by: Felipe Balbi --- drivers/usb/gadget/udc-core.c | 39 ++++++++++++++++++++++++++++++++++----- include/linux/usb/gadget.h | 2 ++ 2 files changed, 36 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/gadget/udc-core.c b/drivers/usb/gadget/udc-core.c index 2423d024654f..a50811e35bdb 100644 --- a/drivers/usb/gadget/udc-core.c +++ b/drivers/usb/gadget/udc-core.c @@ -166,15 +166,23 @@ static void usb_udc_release(struct device *dev) } static const struct attribute_group *usb_udc_attr_groups[]; + +static void usb_udc_nop_release(struct device *dev) +{ + dev_vdbg(dev, "%s\n", __func__); +} + /** - * usb_add_gadget_udc - adds a new gadget to the udc class driver list - * @parent: the parent device to this udc. Usually the controller - * driver's device. - * @gadget: the gadget to be added to the list + * usb_add_gadget_udc_release - adds a new gadget to the udc class driver list + * @parent: the parent device to this udc. Usually the controller driver's + * device. + * @gadget: the gadget to be added to the list. + * @release: a gadget release function. * * Returns zero on success, negative errno otherwise. */ -int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) +int usb_add_gadget_udc_release(struct device *parent, struct usb_gadget *gadget, + void (*release)(struct device *dev)) { struct usb_udc *udc; int ret = -ENOMEM; @@ -190,6 +198,13 @@ int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) gadget->dev.dma_parms = parent->dma_parms; gadget->dev.dma_mask = parent->dma_mask; + if (release) { + gadget->dev.release = release; + } else { + if (!gadget->dev.release) + gadget->dev.release = usb_udc_nop_release; + } + ret = device_register(&gadget->dev); if (ret) goto err2; @@ -231,6 +246,20 @@ err2: err1: return ret; } +EXPORT_SYMBOL_GPL(usb_add_gadget_udc_release); + +/** + * usb_add_gadget_udc - adds a new gadget to the udc class driver list + * @parent: the parent device to this udc. Usually the controller + * driver's device. + * @gadget: the gadget to be added to the list + * + * Returns zero on success, negative errno otherwise.
+ */ +int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget) +{ + return usb_add_gadget_udc_release(parent, gadget, NULL); +} EXPORT_SYMBOL_GPL(usb_add_gadget_udc); static void usb_gadget_remove_driver(struct usb_udc *udc) diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index 32b734d88d6b..c454a88abf2e 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -874,6 +874,8 @@ int usb_gadget_probe_driver(struct usb_gadget_driver *driver); */ int usb_gadget_unregister_driver(struct usb_gadget_driver *driver); +extern int usb_add_gadget_udc_release(struct device *parent, + struct usb_gadget *gadget, void (*release)(struct device *dev)); extern int usb_add_gadget_udc(struct device *parent, struct usb_gadget *gadget); extern void usb_del_gadget_udc(struct usb_gadget *gadget); extern int udc_attach_driver(const char *name, -- cgit From 42c0bf1ce7c067bbc3e77d5626f102a16bc4fb6b Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 7 Mar 2013 10:39:57 +0200 Subject: usb: otg: prefix otg_state_string with usb_ all other functions under drivers/usb/ start with usb_, let's do the same thing. This patch is in preparation for moving otg_state_string to usb-common.c and deleting otg.c completely. Signed-off-by: Felipe Balbi --- drivers/usb/musb/am35x.c | 8 ++++---- drivers/usb/musb/blackfin.c | 6 +++--- drivers/usb/musb/da8xx.c | 8 ++++---- drivers/usb/musb/davinci.c | 4 ++-- drivers/usb/musb/musb_core.c | 39 ++++++++++++++++++++------------------- drivers/usb/musb/musb_dsps.c | 8 ++++---- drivers/usb/musb/musb_gadget.c | 8 ++++---- drivers/usb/musb/musb_host.c | 2 +- drivers/usb/musb/musb_virthub.c | 4 ++-- drivers/usb/musb/omap2430.c | 6 +++--- drivers/usb/musb/tusb6010.c | 14 +++++++------- drivers/usb/otg/fsl_otg.c | 2 +- drivers/usb/otg/isp1301_omap.c | 6 +++--- drivers/usb/otg/otg.c | 4 ++-- drivers/usb/otg/otg_fsm.c | 2 +- include/linux/usb/otg.h | 4 ++-- 16 files changed, 63 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/musb/am35x.c b/drivers/usb/musb/am35x.c index 59eea219034a..2231850c0625 100644 --- a/drivers/usb/musb/am35x.c +++ b/drivers/usb/musb/am35x.c @@ -149,7 +149,7 @@ static void otg_timer(unsigned long _musb) */ devctl = musb_readb(mregs, MUSB_DEVCTL); dev_dbg(musb->controller, "Poll devctl %02x (%s)\n", devctl, - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); spin_lock_irqsave(&musb->lock, flags); switch (musb->xceiv->state) { @@ -195,7 +195,7 @@ static void am35x_musb_try_idle(struct musb *musb, unsigned long timeout) if (musb->is_active || (musb->a_wait_bcon == 0 && musb->xceiv->state == OTG_STATE_A_WAIT_BCON)) { dev_dbg(musb->controller, "%s active, deleting timer\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); del_timer(&otg_workaround); last_timer = jiffies; return; @@ -208,7 +208,7 @@ static void am35x_musb_try_idle(struct musb *musb, unsigned long timeout) last_timer = timeout; dev_dbg(musb->controller, "%s inactive, starting idle timer for %u ms\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), jiffies_to_msecs(timeout - jiffies)); mod_timer(&otg_workaround, timeout); } @@ -298,7 +298,7 @@ static irqreturn_t am35x_musb_interrupt(int irq, void *hci) /* NOTE: this must complete power-on within 100 ms. */ dev_dbg(musb->controller, "VBUS %s (%s)%s, devctl %02x\n", drvvbus ? 
"on" : "off", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), err ? " ERROR" : "", devctl); ret = IRQ_HANDLED; diff --git a/drivers/usb/musb/blackfin.c b/drivers/usb/musb/blackfin.c index dbb31b30c7fa..5e63b160db0c 100644 --- a/drivers/usb/musb/blackfin.c +++ b/drivers/usb/musb/blackfin.c @@ -280,13 +280,13 @@ static void musb_conn_timer_handler(unsigned long _musb) break; default: dev_dbg(musb->controller, "%s state not handled\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); break; } spin_unlock_irqrestore(&musb->lock, flags); dev_dbg(musb->controller, "state is %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } static void bfin_musb_enable(struct musb *musb) @@ -307,7 +307,7 @@ static void bfin_musb_set_vbus(struct musb *musb, int is_on) dev_dbg(musb->controller, "VBUS %s, devctl %02x " /* otg %3x conf %08x prcm %08x */ "\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), musb_readb(musb->mregs, MUSB_DEVCTL)); } diff --git a/drivers/usb/musb/da8xx.c b/drivers/usb/musb/da8xx.c index 7c71769d71ff..ea7e591093ee 100644 --- a/drivers/usb/musb/da8xx.c +++ b/drivers/usb/musb/da8xx.c @@ -198,7 +198,7 @@ static void otg_timer(unsigned long _musb) */ devctl = musb_readb(mregs, MUSB_DEVCTL); dev_dbg(musb->controller, "Poll devctl %02x (%s)\n", devctl, - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); spin_lock_irqsave(&musb->lock, flags); switch (musb->xceiv->state) { @@ -267,7 +267,7 @@ static void da8xx_musb_try_idle(struct musb *musb, unsigned long timeout) if (musb->is_active || (musb->a_wait_bcon == 0 && musb->xceiv->state == OTG_STATE_A_WAIT_BCON)) { dev_dbg(musb->controller, "%s active, deleting timer\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); del_timer(&otg_workaround); last_timer = jiffies; return; @@ -280,7 +280,7 @@ static void da8xx_musb_try_idle(struct musb *musb, unsigned long timeout) last_timer = timeout; dev_dbg(musb->controller, "%s inactive, starting idle timer for %u ms\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), jiffies_to_msecs(timeout - jiffies)); mod_timer(&otg_workaround, timeout); } @@ -360,7 +360,7 @@ static irqreturn_t da8xx_musb_interrupt(int irq, void *hci) dev_dbg(musb->controller, "VBUS %s (%s)%s, devctl %02x\n", drvvbus ? "on" : "off", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), err ? " ERROR" : "", devctl); ret = IRQ_HANDLED; diff --git a/drivers/usb/musb/davinci.c b/drivers/usb/musb/davinci.c index e040d9103735..bea6cc35471c 100644 --- a/drivers/usb/musb/davinci.c +++ b/drivers/usb/musb/davinci.c @@ -215,7 +215,7 @@ static void otg_timer(unsigned long _musb) */ devctl = musb_readb(mregs, MUSB_DEVCTL); dev_dbg(musb->controller, "poll devctl %02x (%s)\n", devctl, - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); spin_lock_irqsave(&musb->lock, flags); switch (musb->xceiv->state) { @@ -349,7 +349,7 @@ static irqreturn_t davinci_musb_interrupt(int irq, void *__hci) davinci_musb_source_power(musb, drvvbus, 0); dev_dbg(musb->controller, "VBUS %s (%s)%s, devctl %02x\n", drvvbus ? "on" : "off", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), err ? 
" ERROR" : "", devctl); retval = IRQ_HANDLED; diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index fad8571ed433..6bd879257e4c 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -372,13 +372,13 @@ static void musb_otg_timer_func(unsigned long data) case OTG_STATE_A_SUSPEND: case OTG_STATE_A_WAIT_BCON: dev_dbg(musb->controller, "HNP: %s timeout\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); musb_platform_set_vbus(musb, 0); musb->xceiv->state = OTG_STATE_A_WAIT_VFALL; break; default: dev_dbg(musb->controller, "HNP: Unhandled mode %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } musb->ignore_disconnect = 0; spin_unlock_irqrestore(&musb->lock, flags); @@ -393,13 +393,14 @@ void musb_hnp_stop(struct musb *musb) void __iomem *mbase = musb->mregs; u8 reg; - dev_dbg(musb->controller, "HNP: stop from %s\n", otg_state_string(musb->xceiv->state)); + dev_dbg(musb->controller, "HNP: stop from %s\n", + usb_otg_state_string(musb->xceiv->state)); switch (musb->xceiv->state) { case OTG_STATE_A_PERIPHERAL: musb_g_disconnect(musb); dev_dbg(musb->controller, "HNP: back to %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); break; case OTG_STATE_B_HOST: dev_dbg(musb->controller, "HNP: Disabling HR\n"); @@ -413,7 +414,7 @@ void musb_hnp_stop(struct musb *musb) break; default: dev_dbg(musb->controller, "HNP: Stopping in unknown state %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } /* @@ -451,7 +452,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb, */ if (int_usb & MUSB_INTR_RESUME) { handled = IRQ_HANDLED; - dev_dbg(musb->controller, "RESUME (%s)\n", otg_state_string(musb->xceiv->state)); + dev_dbg(musb->controller, "RESUME (%s)\n", usb_otg_state_string(musb->xceiv->state)); if (devctl & MUSB_DEVCTL_HM) { void __iomem *mbase = musb->mregs; @@ -493,7 +494,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb, default: WARNING("bogus %s RESUME (%s)\n", "host", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } else { switch (musb->xceiv->state) { @@ -522,7 +523,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb, default: WARNING("bogus %s RESUME (%s)\n", "peripheral", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } } @@ -538,7 +539,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb, } dev_dbg(musb->controller, "SESSION_REQUEST (%s)\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); /* IRQ arrives from ID pin sense or (later, if VBUS power * is removed) SRP. 
responses are time critical: @@ -603,7 +604,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb, } dev_dbg(musb->controller, "VBUS_ERROR in %s (%02x, %s), retry #%d, port1 %08x\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), devctl, ({ char *s; switch (devctl & MUSB_DEVCTL_VBUS) { @@ -628,7 +629,7 @@ static irqreturn_t musb_stage0_irq(struct musb *musb, u8 int_usb, if (int_usb & MUSB_INTR_SUSPEND) { dev_dbg(musb->controller, "SUSPEND (%s) devctl %02x\n", - otg_state_string(musb->xceiv->state), devctl); + usb_otg_state_string(musb->xceiv->state), devctl); handled = IRQ_HANDLED; switch (musb->xceiv->state) { @@ -745,12 +746,12 @@ b_host: usb_hcd_resume_root_hub(hcd); dev_dbg(musb->controller, "CONNECT (%s) devctl %02x\n", - otg_state_string(musb->xceiv->state), devctl); + usb_otg_state_string(musb->xceiv->state), devctl); } if ((int_usb & MUSB_INTR_DISCONNECT) && !musb->ignore_disconnect) { dev_dbg(musb->controller, "DISCONNECT (%s) as %s, devctl %02x\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), MUSB_MODE(musb), devctl); handled = IRQ_HANDLED; @@ -787,7 +788,7 @@ b_host: break; default: WARNING("unhandled DISCONNECT transition (%s)\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); break; } } @@ -813,7 +814,7 @@ b_host: } } else { dev_dbg(musb->controller, "BUS RESET as %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); switch (musb->xceiv->state) { case OTG_STATE_A_SUSPEND: /* We need to ignore disconnect on suspend @@ -826,7 +827,7 @@ b_host: case OTG_STATE_A_WAIT_BCON: /* OPT TD.4.7-900ms */ /* never use invalid T(a_wait_bcon) */ dev_dbg(musb->controller, "HNP: in %s, %d msec timeout\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), TA_WAIT_BCON(musb)); mod_timer(&musb->otg_timer, jiffies + msecs_to_jiffies(TA_WAIT_BCON(musb))); @@ -838,7 +839,7 @@ b_host: break; case OTG_STATE_B_WAIT_ACON: dev_dbg(musb->controller, "HNP: RESET (%s), to b_peripheral\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); musb->xceiv->state = OTG_STATE_B_PERIPHERAL; musb_g_reset(musb); break; @@ -850,7 +851,7 @@ b_host: break; default: dev_dbg(musb->controller, "Unhandled BUS RESET as %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } } @@ -1632,7 +1633,7 @@ musb_mode_show(struct device *dev, struct device_attribute *attr, char *buf) int ret = -EINVAL; spin_lock_irqsave(&musb->lock, flags); - ret = sprintf(buf, "%s\n", otg_state_string(musb->xceiv->state)); + ret = sprintf(buf, "%s\n", usb_otg_state_string(musb->xceiv->state)); spin_unlock_irqrestore(&musb->lock, flags); return ret; diff --git a/drivers/usb/musb/musb_dsps.c b/drivers/usb/musb/musb_dsps.c index 4b4987461adb..1ea553d2b77f 100644 --- a/drivers/usb/musb/musb_dsps.c +++ b/drivers/usb/musb/musb_dsps.c @@ -225,7 +225,7 @@ static void otg_timer(unsigned long _musb) */ devctl = dsps_readb(mregs, MUSB_DEVCTL); dev_dbg(musb->controller, "Poll devctl %02x (%s)\n", devctl, - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); spin_lock_irqsave(&musb->lock, flags); switch (musb->xceiv->state) { @@ -274,7 +274,7 @@ static void dsps_musb_try_idle(struct musb *musb, unsigned long timeout) if (musb->is_active || (musb->a_wait_bcon == 0 && musb->xceiv->state == OTG_STATE_A_WAIT_BCON)) { dev_dbg(musb->controller, "%s active, deleting 
timer\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); del_timer(&glue->timer[pdev->id]); glue->last_timer[pdev->id] = jiffies; return; @@ -289,7 +289,7 @@ static void dsps_musb_try_idle(struct musb *musb, unsigned long timeout) glue->last_timer[pdev->id] = timeout; dev_dbg(musb->controller, "%s inactive, starting idle timer for %u ms\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), jiffies_to_msecs(timeout - jiffies)); mod_timer(&glue->timer[pdev->id], timeout); } @@ -378,7 +378,7 @@ static irqreturn_t dsps_interrupt(int irq, void *hci) /* NOTE: this must complete power-on within 100 ms. */ dev_dbg(musb->controller, "VBUS %s (%s)%s, devctl %02x\n", drvvbus ? "on" : "off", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), err ? " ERROR" : "", devctl); ret = IRQ_HANDLED; diff --git a/drivers/usb/musb/musb_gadget.c b/drivers/usb/musb/musb_gadget.c index 6101ebf803fd..e8408883ab0d 100644 --- a/drivers/usb/musb/musb_gadget.c +++ b/drivers/usb/musb/musb_gadget.c @@ -1571,7 +1571,7 @@ static int musb_gadget_wakeup(struct usb_gadget *gadget) goto done; default: dev_dbg(musb->controller, "Unhandled wake: %s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); goto done; } @@ -1970,7 +1970,7 @@ void musb_g_resume(struct musb *musb) break; default: WARNING("unhandled RESUME transition (%s)\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } @@ -2000,7 +2000,7 @@ void musb_g_suspend(struct musb *musb) * A_PERIPHERAL may need care too */ WARNING("unhandled SUSPEND transition (%s)\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } @@ -2034,7 +2034,7 @@ void musb_g_disconnect(struct musb *musb) switch (musb->xceiv->state) { default: dev_dbg(musb->controller, "Unhandled disconnect %s, setting a_idle\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); musb->xceiv->state = OTG_STATE_A_IDLE; MUSB_HST_MODE(musb); break; diff --git a/drivers/usb/musb/musb_host.c b/drivers/usb/musb/musb_host.c index 1ce1fcf3f3e7..51e9e8a38444 100644 --- a/drivers/usb/musb/musb_host.c +++ b/drivers/usb/musb/musb_host.c @@ -2453,7 +2453,7 @@ static int musb_bus_suspend(struct usb_hcd *hcd) if (musb->is_active) { WARNING("trying to suspend as %s while active\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); return -EBUSY; } else return 0; diff --git a/drivers/usb/musb/musb_virthub.c b/drivers/usb/musb/musb_virthub.c index f70579154ded..ef7d11045f56 100644 --- a/drivers/usb/musb/musb_virthub.c +++ b/drivers/usb/musb/musb_virthub.c @@ -95,7 +95,7 @@ static void musb_port_suspend(struct musb *musb, bool do_suspend) break; default: dev_dbg(musb->controller, "bogus rh suspend? 
%s\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } else if (power & MUSB_POWER_SUSPENDM) { power &= ~MUSB_POWER_SUSPENDM; @@ -203,7 +203,7 @@ void musb_root_disconnect(struct musb *musb) break; default: dev_dbg(musb->controller, "host disconnect (%s)\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } } diff --git a/drivers/usb/musb/omap2430.c b/drivers/usb/musb/omap2430.c index 1a42a458f2c4..8ba9bb2a91a7 100644 --- a/drivers/usb/musb/omap2430.c +++ b/drivers/usb/musb/omap2430.c @@ -117,7 +117,7 @@ static void omap2430_musb_try_idle(struct musb *musb, unsigned long timeout) if (musb->is_active || ((musb->a_wait_bcon == 0) && (musb->xceiv->state == OTG_STATE_A_WAIT_BCON))) { dev_dbg(musb->controller, "%s active, deleting timer\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); del_timer(&musb_idle_timer); last_timer = jiffies; return; @@ -134,7 +134,7 @@ static void omap2430_musb_try_idle(struct musb *musb, unsigned long timeout) last_timer = timeout; dev_dbg(musb->controller, "%s inactive, for idle timer for %lu ms\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), (unsigned long)jiffies_to_msecs(timeout - jiffies)); mod_timer(&musb_idle_timer, timeout); } @@ -200,7 +200,7 @@ static void omap2430_musb_set_vbus(struct musb *musb, int is_on) dev_dbg(musb->controller, "VBUS %s, devctl %02x " /* otg %3x conf %08x prcm %08x */ "\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), musb_readb(musb->mregs, MUSB_DEVCTL)); } diff --git a/drivers/usb/musb/tusb6010.c b/drivers/usb/musb/tusb6010.c index 464bd23cccda..7369ba33c94f 100644 --- a/drivers/usb/musb/tusb6010.c +++ b/drivers/usb/musb/tusb6010.c @@ -423,7 +423,7 @@ static void musb_do_idle(unsigned long _musb) && (musb->idle_timeout == 0 || time_after(jiffies, musb->idle_timeout))) { dev_dbg(musb->controller, "Nothing connected %s, turning off VBUS\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); } /* FALLTHROUGH */ case OTG_STATE_A_IDLE: @@ -478,7 +478,7 @@ static void tusb_musb_try_idle(struct musb *musb, unsigned long timeout) if (musb->is_active || ((musb->a_wait_bcon == 0) && (musb->xceiv->state == OTG_STATE_A_WAIT_BCON))) { dev_dbg(musb->controller, "%s active, deleting timer\n", - otg_state_string(musb->xceiv->state)); + usb_otg_state_string(musb->xceiv->state)); del_timer(&musb_idle_timer); last_timer = jiffies; return; @@ -495,7 +495,7 @@ static void tusb_musb_try_idle(struct musb *musb, unsigned long timeout) last_timer = timeout; dev_dbg(musb->controller, "%s inactive, for idle timer for %lu ms\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), (unsigned long)jiffies_to_msecs(timeout - jiffies)); mod_timer(&musb_idle_timer, timeout); } @@ -571,7 +571,7 @@ static void tusb_musb_set_vbus(struct musb *musb, int is_on) musb_writeb(musb->mregs, MUSB_DEVCTL, devctl); dev_dbg(musb->controller, "VBUS %s, devctl %02x otg %3x conf %08x prcm %08x\n", - otg_state_string(musb->xceiv->state), + usb_otg_state_string(musb->xceiv->state), musb_readb(musb->mregs, MUSB_DEVCTL), musb_readl(tbase, TUSB_DEV_OTG_STAT), conf, prcm); @@ -678,13 +678,13 @@ tusb_otg_ints(struct musb *musb, u32 int_src, void __iomem *tbase) musb->is_active = 0; } dev_dbg(musb->controller, "vbus change, %s, otg %03x\n", - otg_state_string(musb->xceiv->state), otg_stat); + 
usb_otg_state_string(musb->xceiv->state), otg_stat); idle_timeout = jiffies + (1 * HZ); schedule_work(&musb->irq_work); } else /* A-dev state machine */ { dev_dbg(musb->controller, "vbus change, %s, otg %03x\n", - otg_state_string(musb->xceiv->state), otg_stat); + usb_otg_state_string(musb->xceiv->state), otg_stat); switch (musb->xceiv->state) { case OTG_STATE_A_IDLE: @@ -733,7 +733,7 @@ tusb_otg_ints(struct musb *musb, u32 int_src, void __iomem *tbase) u8 devctl; dev_dbg(musb->controller, "%s timer, %03x\n", - otg_state_string(musb->xceiv->state), otg_stat); + usb_otg_state_string(musb->xceiv->state), otg_stat); switch (musb->xceiv->state) { case OTG_STATE_A_WAIT_VRISE: diff --git a/drivers/usb/otg/fsl_otg.c b/drivers/usb/otg/fsl_otg.c index 37e8e1578316..72a2a00c2487 100644 --- a/drivers/usb/otg/fsl_otg.c +++ b/drivers/usb/otg/fsl_otg.c @@ -992,7 +992,7 @@ static int show_fsl_usb2_otg_state(struct device *dev, /* State */ t = scnprintf(next, size, "OTG state: %s\n\n", - otg_state_string(fsl_otg_dev->phy.state)); + usb_otg_state_string(fsl_otg_dev->phy.state)); size -= t; next += t; diff --git a/drivers/usb/otg/isp1301_omap.c b/drivers/usb/otg/isp1301_omap.c index af9cb11626b2..8fe0c3b95261 100644 --- a/drivers/usb/otg/isp1301_omap.c +++ b/drivers/usb/otg/isp1301_omap.c @@ -236,7 +236,7 @@ isp1301_clear_bits(struct isp1301 *isp, u8 reg, u8 bits) static inline const char *state_name(struct isp1301 *isp) { - return otg_state_string(isp->phy.state); + return usb_otg_state_string(isp->phy.state); } /*-------------------------------------------------------------------------*/ @@ -481,7 +481,7 @@ static void check_state(struct isp1301 *isp, const char *tag) if (isp->phy.state == state && !extra) return; pr_debug("otg: %s FSM %s/%02x, %s, %06x\n", tag, - otg_state_string(state), fsm, state_name(isp), + usb_otg_state_string(state), fsm, state_name(isp), omap_readl(OTG_CTRL)); } @@ -1077,7 +1077,7 @@ static void isp_update_otg(struct isp1301 *isp, u8 stat) if (state != isp->phy.state) pr_debug(" isp, %s -> %s\n", - otg_state_string(state), state_name(isp)); + usb_otg_state_string(state), state_name(isp)); #ifdef CONFIG_USB_OTG /* update the OTG controller state to match the isp1301; may diff --git a/drivers/usb/otg/otg.c b/drivers/usb/otg/otg.c index 358cfd9bce89..fd9a4b7bebe7 100644 --- a/drivers/usb/otg/otg.c +++ b/drivers/usb/otg/otg.c @@ -11,7 +11,7 @@ #include #include -const char *otg_state_string(enum usb_otg_state state) +const char *usb_otg_state_string(enum usb_otg_state state) { switch (state) { case OTG_STATE_A_IDLE: @@ -44,4 +44,4 @@ const char *otg_state_string(enum usb_otg_state state) return "UNDEFINED"; } } -EXPORT_SYMBOL(otg_state_string); +EXPORT_SYMBOL(usb_otg_state_string); diff --git a/drivers/usb/otg/otg_fsm.c b/drivers/usb/otg/otg_fsm.c index ade131a8ae5e..1f729a15decb 100644 --- a/drivers/usb/otg/otg_fsm.c +++ b/drivers/usb/otg/otg_fsm.c @@ -119,7 +119,7 @@ int otg_set_state(struct otg_fsm *fsm, enum usb_otg_state new_state) state_changed = 1; if (fsm->otg->phy->state == new_state) return 0; - VDBG("Set state: %s\n", otg_state_string(new_state)); + VDBG("Set state: %s\n", usb_otg_state_string(new_state)); otg_leave_state(fsm, fsm->otg->phy->state); switch (new_state) { case OTG_STATE_B_IDLE: diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h index e8a5fe87c6bd..9f9fb3927b0a 100644 --- a/include/linux/usb/otg.h +++ b/include/linux/usb/otg.h @@ -37,9 +37,9 @@ struct usb_otg { }; #ifdef CONFIG_USB_OTG_UTILS -extern const char *otg_state_string(enum usb_otg_state 
state); +extern const char *usb_otg_state_string(enum usb_otg_state state); #else -static inline const char *otg_state_string(enum usb_otg_state state) +static inline const char *usb_otg_state_string(enum usb_otg_state state) { return NULL; } -- cgit From 7009bdd7f31ed6e769af0f76e2368bb6033be572 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 7 Mar 2013 10:45:56 +0200 Subject: usb: otg: move usb_otg_state_string to usb-common.c otg.c only had a single function definition which might make more sense to be placed in usb-common.c. While doing that, we also delete otg.c since it's now empty. Signed-off-by: Felipe Balbi --- drivers/usb/otg/Makefile | 3 --- drivers/usb/otg/otg.c | 47 ----------------------------------------------- drivers/usb/usb-common.c | 26 ++++++++++++++++++++++++++ include/linux/usb/otg.h | 7 ------- 4 files changed, 26 insertions(+), 57 deletions(-) delete mode 100644 drivers/usb/otg/otg.c (limited to 'include/linux') diff --git a/drivers/usb/otg/Makefile b/drivers/usb/otg/Makefile index a844b8d35d14..6abc45388e24 100644 --- a/drivers/usb/otg/Makefile +++ b/drivers/usb/otg/Makefile @@ -5,9 +5,6 @@ ccflags-$(CONFIG_USB_DEBUG) := -DDEBUG ccflags-$(CONFIG_USB_GADGET_DEBUG) += -DDEBUG -# infrastructure -obj-$(CONFIG_USB_OTG_UTILS) += otg.o - # transceiver drivers obj-$(CONFIG_USB_GPIO_VBUS) += gpio_vbus.o obj-$(CONFIG_ISP1301_OMAP) += isp1301_omap.o diff --git a/drivers/usb/otg/otg.c b/drivers/usb/otg/otg.c deleted file mode 100644 index fd9a4b7bebe7..000000000000 --- a/drivers/usb/otg/otg.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * otg.c -- USB OTG utility code - * - * Copyright (C) 2004 Texas Instruments - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. 
- */ -#include -#include - -const char *usb_otg_state_string(enum usb_otg_state state) -{ - switch (state) { - case OTG_STATE_A_IDLE: - return "a_idle"; - case OTG_STATE_A_WAIT_VRISE: - return "a_wait_vrise"; - case OTG_STATE_A_WAIT_BCON: - return "a_wait_bcon"; - case OTG_STATE_A_HOST: - return "a_host"; - case OTG_STATE_A_SUSPEND: - return "a_suspend"; - case OTG_STATE_A_PERIPHERAL: - return "a_peripheral"; - case OTG_STATE_A_WAIT_VFALL: - return "a_wait_vfall"; - case OTG_STATE_A_VBUS_ERR: - return "a_vbus_err"; - case OTG_STATE_B_IDLE: - return "b_idle"; - case OTG_STATE_B_SRP_INIT: - return "b_srp_init"; - case OTG_STATE_B_PERIPHERAL: - return "b_peripheral"; - case OTG_STATE_B_WAIT_ACON: - return "b_wait_acon"; - case OTG_STATE_B_HOST: - return "b_host"; - default: - return "UNDEFINED"; - } -} -EXPORT_SYMBOL(usb_otg_state_string); diff --git a/drivers/usb/usb-common.c b/drivers/usb/usb-common.c index 070b681e5d17..0db0a919d72b 100644 --- a/drivers/usb/usb-common.c +++ b/drivers/usb/usb-common.c @@ -14,6 +14,32 @@ #include #include #include +#include + +const char *usb_otg_state_string(enum usb_otg_state state) +{ + static const char *const names[] = { + [OTG_STATE_A_IDLE] = "a_idle", + [OTG_STATE_A_WAIT_VRISE] = "a_wait_vrise", + [OTG_STATE_A_WAIT_BCON] = "a_wait_bcon", + [OTG_STATE_A_HOST] = "a_host", + [OTG_STATE_A_SUSPEND] = "a_suspend", + [OTG_STATE_A_PERIPHERAL] = "a_peripheral", + [OTG_STATE_A_WAIT_VFALL] = "a_wait_vfall", + [OTG_STATE_A_VBUS_ERR] = "a_vbus_err", + [OTG_STATE_B_IDLE] = "b_idle", + [OTG_STATE_B_SRP_INIT] = "b_srp_init", + [OTG_STATE_B_PERIPHERAL] = "b_peripheral", + [OTG_STATE_B_WAIT_ACON] = "b_wait_acon", + [OTG_STATE_B_HOST] = "b_host", + }; + + if (state < 0 || state >= ARRAY_SIZE(names)) + return "UNDEFINED"; + + return names[state]; +} +EXPORT_SYMBOL_GPL(usb_otg_state_string); const char *usb_speed_string(enum usb_device_speed speed) { diff --git a/include/linux/usb/otg.h b/include/linux/usb/otg.h index 9f9fb3927b0a..291e01ba32e5 100644 --- a/include/linux/usb/otg.h +++ b/include/linux/usb/otg.h @@ -36,14 +36,7 @@ struct usb_otg { }; -#ifdef CONFIG_USB_OTG_UTILS extern const char *usb_otg_state_string(enum usb_otg_state state); -#else -static inline const char *usb_otg_state_string(enum usb_otg_state state) -{ - return NULL; -} -#endif /* Context: can sleep */ static inline int -- cgit From edc7cb2e955f222fe51cd44c1cf9c94d58017344 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 7 Mar 2013 11:13:43 +0200 Subject: usb: phy: make it a menuconfig We already have a considerable number of USB PHY drivers; making this a menuconfig prevents us from adding too much churn to USB's menuconfig. While at it, also select USB_OTG_UTILS from this new menuconfig just to keep backwards compatibility until we manage to remove that symbol.
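A side note, sketched here for illustration only: because USB_PHY is a tristate symbol, the include/linux/usb/phy.h hunk below also switches from a plain #ifdef CONFIG_USB_OTG_UTILS to IS_ENABLED(CONFIG_USB_PHY), which is true for both built-in (=y) and modular (=m) builds:

/* Illustrative fragment mirroring the phy.h change in this patch.
 * A bare #ifdef CONFIG_USB_PHY would be false for =m builds, where
 * only CONFIG_USB_PHY_MODULE is defined; IS_ENABLED() covers both. */
#if IS_ENABLED(CONFIG_USB_PHY)
extern struct usb_phy *usb_get_phy(enum usb_phy_type type);
#else
static inline struct usb_phy *usb_get_phy(enum usb_phy_type type)
{
	return NULL;	/* stub when the PHY layer is compiled out */
}
#endif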
Signed-off-by: Felipe Balbi --- drivers/Makefile | 2 +- drivers/usb/phy/Kconfig | 17 ++++++++++++----- drivers/usb/phy/Makefile | 2 +- include/linux/usb/phy.h | 2 +- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/Makefile b/drivers/Makefile index dce39a95fa71..3c200a243af0 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -79,7 +79,7 @@ obj-$(CONFIG_ATA_OVER_ETH) += block/aoe/ obj-$(CONFIG_PARIDE) += block/paride/ obj-$(CONFIG_TC) += tc/ obj-$(CONFIG_UWB) += uwb/ -obj-$(CONFIG_USB_OTG_UTILS) += usb/ +obj-$(CONFIG_USB_PHY) += usb/ obj-$(CONFIG_USB) += usb/ obj-$(CONFIG_PCI) += usb/ obj-$(CONFIG_USB_GADGET) += usb/ diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 32ce740a9dd5..832cd694fb8b 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -1,8 +1,17 @@ # # Physical Layer USB driver configuration # -comment "USB Physical Layer drivers" - depends on USB || USB_GADGET +menuconfig USB_PHY + tristate "USB Physical Layer drivers" + select USB_OTG_UTILS + help + USB controllers (those which are host, device or DRD) need a + device to handle the physical layer signalling, commonly called + a PHY. + + The following drivers add support for such PHY devices. + +if USB_PHY config USB_OTG_UTILS bool @@ -10,8 +19,6 @@ config USB_OTG_UTILS Select this to make sure the build includes objects from the OTG infrastructure directory. -if USB || USB_GADGET - # # USB Transceiver Drivers # @@ -206,4 +213,4 @@ config USB_ULPI_VIEWPORT Provides read/write operations to the ULPI phy register set for controllers with a viewport register (e.g. Chipidea/ARC controllers). -endif # USB || OTG +endif # USB_PHY diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile index 34488ceef491..d10a8b387ffe 100644 --- a/drivers/usb/phy/Makefile +++ b/drivers/usb/phy/Makefile @@ -4,7 +4,7 @@ ccflags-$(CONFIG_USB_DEBUG) := -DDEBUG -obj-$(CONFIG_USB_OTG_UTILS) += phy.o +obj-$(CONFIG_USB_PHY) += phy.o # transceiver drivers, keep the list sorted diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index 15847cbdb512..b001dc3d6354 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -161,7 +161,7 @@ usb_phy_shutdown(struct usb_phy *x) } /* for usb host and peripheral controller drivers */ -#ifdef CONFIG_USB_OTG_UTILS +#if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); -- cgit From b774212ea5f13911a5e0211a7088e42dad46b4c8 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Fri, 8 Mar 2013 13:22:58 +0200 Subject: usb: phy: introduce ->set_vbus() method this method will be used to enable or disable the charge pump. Whenever we have DRD devices, we need to be able to turn VBUS on or off whenever we want. Note that in the ideal case, this would be controlled by the ID-pin Interrupt, but not all devices have ID-pin properly routed since manufacturers can choose to save that trace if they're building a host-only product out of a DRD IP. This is also useful during debugging where we might not have the proper cable hanging around. 
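For illustration only (not part of this patch), a dual-role controller driver could then source or cut VBUS from its ID-pin handling or from a debug hook; only usb_phy_vbus_on()/usb_phy_vbus_off() below come from this patch, the example_drd names are assumptions:

/* Hypothetical DRD glue toggling the charge pump via the new helpers. */
static void example_drd_id_event(struct example_drd *drd, bool id_grounded)
{
	if (id_grounded)
		usb_phy_vbus_on(drd->phy);	/* A-device: source VBUS */
	else
		usb_phy_vbus_off(drd->phy);	/* B-device: stop sourcing */
}

Note that both helpers return 0 when a PHY implements no ->set_vbus() method, so callers don't need to check for it themselves.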
Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index b001dc3d6354..b7c2217c585f 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -91,6 +91,9 @@ struct usb_phy { int (*init)(struct usb_phy *x); void (*shutdown)(struct usb_phy *x); + /* enable/disable VBUS */ + int (*set_vbus)(struct usb_phy *x, int on); + /* effective for B devices, ignored for A-peripheral */ int (*set_power)(struct usb_phy *x, unsigned mA); @@ -160,6 +163,24 @@ usb_phy_shutdown(struct usb_phy *x) x->shutdown(x); } +static inline int +usb_phy_vbus_on(struct usb_phy *x) +{ + if (!x->set_vbus) + return 0; + + return x->set_vbus(x, true); +} + +static inline int +usb_phy_vbus_off(struct usb_phy *x) +{ + if (!x->set_vbus) + return 0; + + return x->set_vbus(x, false); +} + /* for usb host and peripheral controller drivers */ #if IS_ENABLED(CONFIG_USB_PHY) extern struct usb_phy *usb_get_phy(enum usb_phy_type type); extern struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type); -- cgit From 2ba7943af0f0cca5a069cd3aff807815bc76fff1 Mon Sep 17 00:00:00 2001 From: Kishon Vijay Abraham I Date: Thu, 7 Mar 2013 18:51:44 +0530 Subject: usb: dwc3: dwc3-omap: return -EPROBE_DEFER if probe has not yet executed return -EPROBE_DEFER from dwc3_omap_mailbox in dwc3-omap.c if the probe of dwc3-omap has not yet executed or has failed. Signed-off-by: Kishon Vijay Abraham I Signed-off-by: Felipe Balbi --- drivers/usb/dwc3/dwc3-omap.c | 7 +++++-- include/linux/usb/dwc3-omap.h | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index 546f1fd84920..2fe9723ff1df 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -138,11 +138,14 @@ static inline void dwc3_omap_writel(void __iomem *base, u32 offset, u32 value) writel(value, base + offset); } -void dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status) +int dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status) { u32 val; struct dwc3_omap *omap = _omap; + if (!omap) + return -EPROBE_DEFER; + switch (status) { case OMAP_DWC3_ID_GROUND: dev_dbg(omap->dev, "ID GND\n"); @@ -185,7 +188,7 @@ static int dwc3_omap_probe(struct platform_device *pdev) dev_dbg(omap->dev, "ID float\n"); } - return; + return 0; } EXPORT_SYMBOL_GPL(dwc3_omap_mailbox); diff --git a/include/linux/usb/dwc3-omap.h b/include/linux/usb/dwc3-omap.h index 51eae14477f7..5615f4d82724 100644 --- a/include/linux/usb/dwc3-omap.h +++ b/include/linux/usb/dwc3-omap.h @@ -19,11 +19,11 @@ enum omap_dwc3_vbus_id_status { }; #if (defined(CONFIG_USB_DWC3) || defined(CONFIG_USB_DWC3_MODULE)) -extern void dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status); +extern int dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status); #else -static inline void dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status) +static inline int dwc3_omap_mailbox(enum omap_dwc3_vbus_id_status status) { - return; + return -ENODEV; } #endif -- cgit From b7fa5c2aec5be083eb2719b405089703608e9bc6 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Thu, 14 Mar 2013 17:59:06 +0200 Subject: usb: phy: return -ENXIO when PHY layer isn't enabled in cases where the PHY layer isn't enabled, we still want to return an error code (actually an error pointer) so that our users don't need to cope with either an error pointer or NULL. This will simplify users as below: - return IS_ERR(phy) ?
PTR_ERR(phy) : -ENODEV; + return PTR_ERR(phy); Acked-by: Kishon Vijay Abraham I Reported-by: Alan Stern Signed-off-by: Felipe Balbi --- include/linux/usb/phy.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/phy.h b/include/linux/usb/phy.h index b7c2217c585f..6b5978f57633 100644 --- a/include/linux/usb/phy.h +++ b/include/linux/usb/phy.h @@ -197,29 +197,29 @@ extern int usb_bind_phy(const char *dev_name, u8 index, #else static inline struct usb_phy *usb_get_phy(enum usb_phy_type type) { - return NULL; + return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy(struct device *dev, enum usb_phy_type type) { - return NULL; + return ERR_PTR(-ENXIO); } static inline struct usb_phy *usb_get_phy_dev(struct device *dev, u8 index) { - return NULL; + return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_dev(struct device *dev, u8 index) { - return NULL; + return ERR_PTR(-ENXIO); } static inline struct usb_phy *devm_usb_get_phy_by_phandle(struct device *dev, const char *phandle, u8 index) { - return NULL; + return ERR_PTR(-ENXIO); } static inline void usb_put_phy(struct usb_phy *x) -- cgit From e6251fc244a18a53830f38de84e4fcaee2f58662 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Fri, 15 Mar 2013 16:32:05 +0100 Subject: itg3200: fix incorrect ifdef comment Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- include/linux/iio/gyro/itg3200.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/iio/gyro/itg3200.h b/include/linux/iio/gyro/itg3200.h index c53f16914b77..2a820850f284 100644 --- a/include/linux/iio/gyro/itg3200.h +++ b/include/linux/iio/gyro/itg3200.h @@ -149,6 +149,6 @@ static inline void itg3200_buffer_unconfigure(struct iio_dev *indio_dev) { } -#endif /* CONFIG_IIO_RING_BUFFER */ +#endif /* CONFIG_IIO_BUFFER */ #endif /* ITG3200_H_ */ -- cgit From 5a20d339c785d98d8b050b9afc098e4184a6098c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sun, 3 Mar 2013 13:58:05 +0900 Subject: f2fs: align f2fs maximum name length to linux based filesystem The maximum filename length supported in linux is 255 characters. So let's follow that. 
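For illustration (editorial arithmetic based on the slot macros renamed in the diff below): with 8-byte name slots, a maximal 255-character name occupies

	GET_DENTRY_SLOTS(255) = (255 + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS
	                      = 262 >> 3
	                      = 32 slots

of the NR_DENTRY_IN_BLOCK (214) name slots in a dentry block, and the old limit of 256 also disagreed with the VFS NAME_MAX of 255.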
Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 3 +++ fs/f2fs/namei.c | 2 +- fs/f2fs/super.c | 2 +- include/linux/f2fs_fs.h | 17 +++++++++-------- 4 files changed, 14 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index a1f38443ecee..2851ae6948a1 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -189,6 +189,9 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, unsigned int max_depth; unsigned int level; + if (namelen > F2FS_NAME_LEN) + return NULL; + if (npages == 0) return NULL; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 1a49b881bac0..d4a171b1a68b 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -197,7 +197,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct f2fs_dir_entry *de; struct page *page; - if (dentry->d_name.len > F2FS_MAX_NAME_LEN) + if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8c117649a035..1c7f595ca47c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -180,7 +180,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = sbi->total_node_count; buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); - buf->f_namelen = F2FS_MAX_NAME_LEN; + buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index f9a12f6243a5..df6fab82f87e 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -139,7 +139,7 @@ struct f2fs_extent { __le32 len; /* lengh of the extent */ } __packed; -#define F2FS_MAX_NAME_LEN 256 +#define F2FS_NAME_LEN 255 #define ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ @@ -165,7 +165,8 @@ struct f2fs_inode { __le32 i_flags; /* file attributes */ __le32 i_pino; /* parent inode number */ __le32 i_namelen; /* file name length */ - __u8 i_name[F2FS_MAX_NAME_LEN]; /* file name for SPOR */ + __u8 i_name[F2FS_NAME_LEN]; /* file name for SPOR */ + __u8 i_reserved2; /* for backward compatibility */ struct f2fs_extent i_ext; /* caching a largest extent */ @@ -362,10 +363,10 @@ struct f2fs_summary_block { typedef __le32 f2fs_hash_t; /* One directory entry slot covers 8bytes-long file name */ -#define F2FS_NAME_LEN 8 -#define F2FS_NAME_LEN_BITS 3 +#define F2FS_SLOT_LEN 8 +#define F2FS_SLOT_LEN_BITS 3 -#define GET_DENTRY_SLOTS(x) ((x + F2FS_NAME_LEN - 1) >> F2FS_NAME_LEN_BITS) +#define GET_DENTRY_SLOTS(x) ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) /* the number of dentry in a block */ #define NR_DENTRY_IN_BLOCK 214 @@ -377,10 +378,10 @@ typedef __le32 f2fs_hash_t; #define SIZE_OF_DENTRY_BITMAP ((NR_DENTRY_IN_BLOCK + BITS_PER_BYTE - 1) / \ BITS_PER_BYTE) #define SIZE_OF_RESERVED (PAGE_SIZE - ((SIZE_OF_DIR_ENTRY + \ - F2FS_NAME_LEN) * \ + F2FS_SLOT_LEN) * \ NR_DENTRY_IN_BLOCK + SIZE_OF_DENTRY_BITMAP)) -/* One directory entry slot representing F2FS_NAME_LEN-sized file name */ +/* One directory entry slot representing F2FS_SLOT_LEN-sized file name */ struct f2fs_dir_entry { __le32 hash_code; /* hash code of file name */ __le32 ino; /* inode number */ @@ -394,7 +395,7 @@ struct f2fs_dentry_block { __u8 dentry_bitmap[SIZE_OF_DENTRY_BITMAP]; __u8 reserved[SIZE_OF_RESERVED]; struct 
f2fs_dir_entry dentry[NR_DENTRY_IN_BLOCK]; - __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_NAME_LEN]; + __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; /* file types used in inode_info->flags */ -- cgit From 443580486e3b96578928c1c91e8fbdcf0c9c9c7f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 18 Feb 2013 23:28:34 +0900 Subject: irqchip: Renesas INTC External IRQ pin driver This patch adds a driver for external IRQ pins connected to the INTC block on recent SoCs from Renesas. The INTC hardware block usually covers a rather wide range of features, from external IRQ pin handling to legacy interrupt controller support. On older SoCs the INTC is used as a general purpose interrupt controller both for external IRQ pins and on-chip devices. On more recent ARM based SoCs with Cortex-A9 the main interrupt controller is the GIC, but IRQ trigger setup still needs to happen in the INTC hardware block. This driver implements the glue code needed to configure IRQ triggers and also to handle mask/unmask and demux of external IRQ pins hooked up from the INTC to the GIC. Tested on sh73a0 and r8a7779. The hardware varies quite a bit with SoC model; for instance, register and bitfield widths vary wildly. The driver requires one GIC SPI per external IRQ pin to operate. Each driver instance will handle up to 8 external IRQ pins. The SoCs using this driver are currently mainly used together with regular platform devices, so this driver allows configuration via platform data to support things such as a static interrupt base address. DT support will be added incrementally in the not so distant future. Signed-off-by: Magnus Damm Acked-by: Thomas Gleixner Signed-off-by: Simon Horman --- drivers/irqchip/Kconfig | 4 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-renesas-intc-irqpin.c | 464 +++++++++++++++++++++ .../linux/platform_data/irq-renesas-intc-irqpin.h | 10 + 4 files changed, 479 insertions(+) create mode 100644 drivers/irqchip/irq-renesas-intc-irqpin.c create mode 100644 include/linux/platform_data/irq-renesas-intc-irqpin.h (limited to 'include/linux') diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index a350969e5efe..0f5f1c3825bc 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -25,6 +25,10 @@ config ARM_VIC_NR The maximum number of VICs available in the system, for power management.
+config RENESAS_INTC_IRQPIN + bool + select IRQ_DOMAIN + config VERSATILE_FPGA_IRQ bool select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 98e3b87bdf1b..1aaa4073ab60 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -8,4 +8,5 @@ obj-$(CONFIG_ARCH_SUNXI) += irq-sunxi.o obj-$(CONFIG_ARCH_SPEAR3XX) += spear-shirq.o obj-$(CONFIG_ARM_GIC) += irq-gic.o obj-$(CONFIG_ARM_VIC) += irq-vic.o +obj-$(CONFIG_RENESAS_INTC_IRQPIN) += irq-renesas-intc-irqpin.o obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o diff --git a/drivers/irqchip/irq-renesas-intc-irqpin.c b/drivers/irqchip/irq-renesas-intc-irqpin.c new file mode 100644 index 000000000000..1e5058a56517 --- /dev/null +++ b/drivers/irqchip/irq-renesas-intc-irqpin.c @@ -0,0 +1,464 @@ +/* + * Renesas INTC External IRQ Pin Driver + * + * Copyright (C) 2013 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define INTC_IRQPIN_MAX 8 /* maximum 8 interrupts per driver instance */ + +#define INTC_IRQPIN_REG_SENSE 0 /* ICRn */ +#define INTC_IRQPIN_REG_PRIO 1 /* INTPRInn */ +#define INTC_IRQPIN_REG_SOURCE 2 /* INTREQnn */ +#define INTC_IRQPIN_REG_MASK 3 /* INTMSKnn */ +#define INTC_IRQPIN_REG_CLEAR 4 /* INTMSKCLRnn */ +#define INTC_IRQPIN_REG_NR 5 + +/* INTC external IRQ PIN hardware register access: + * + * SENSE is read-write 32-bit with 2-bits or 4-bits per IRQ (*) + * PRIO is read-write 32-bit with 4-bits per IRQ (**) + * SOURCE is read-only 32-bit or 8-bit with 1-bit per IRQ (***) + * MASK is write-only 32-bit or 8-bit with 1-bit per IRQ (***) + * CLEAR is write-only 32-bit or 8-bit with 1-bit per IRQ (***) + * + * (*) May be accessed by more than one driver instance - lock needed + * (**) Read-modify-write access by one driver instance - lock needed + * (***) Accessed by one driver instance only - no locking needed + */ + +struct intc_irqpin_iomem { + void __iomem *iomem; + unsigned long (*read)(void __iomem *iomem); + void (*write)(void __iomem *iomem, unsigned long data); + int width; +}; + +struct intc_irqpin_irq { + int hw_irq; + int irq; + struct intc_irqpin_priv *p; +}; + +struct intc_irqpin_priv { + struct intc_irqpin_iomem iomem[INTC_IRQPIN_REG_NR]; + struct intc_irqpin_irq irq[INTC_IRQPIN_MAX]; + struct renesas_intc_irqpin_config config; + unsigned int number_of_irqs; + struct platform_device *pdev; + struct irq_chip irq_chip; + struct irq_domain *irq_domain; +}; + +static unsigned long intc_irqpin_read32(void __iomem *iomem) +{ + return ioread32(iomem); +} + +static unsigned long intc_irqpin_read8(void __iomem *iomem) +{ + return ioread8(iomem); +} + +static void intc_irqpin_write32(void __iomem *iomem, unsigned long data) +{ + iowrite32(data, iomem); +} + +static void intc_irqpin_write8(void __iomem *iomem, unsigned 
long data) +{ + iowrite8(data, iomem); +} + +static inline unsigned long intc_irqpin_read(struct intc_irqpin_priv *p, + int reg) +{ + struct intc_irqpin_iomem *i = &p->iomem[reg]; + return i->read(i->iomem); +} + +static inline void intc_irqpin_write(struct intc_irqpin_priv *p, + int reg, unsigned long data) +{ + struct intc_irqpin_iomem *i = &p->iomem[reg]; + i->write(i->iomem, data); +} + +static inline unsigned long intc_irqpin_hwirq_mask(struct intc_irqpin_priv *p, + int reg, int hw_irq) +{ + return BIT((p->iomem[reg].width - 1) - hw_irq); +} + +static inline void intc_irqpin_irq_write_hwirq(struct intc_irqpin_priv *p, + int reg, int hw_irq) +{ + intc_irqpin_write(p, reg, intc_irqpin_hwirq_mask(p, reg, hw_irq)); +} + +static DEFINE_RAW_SPINLOCK(intc_irqpin_lock); /* only used by slow path */ + +static void intc_irqpin_read_modify_write(struct intc_irqpin_priv *p, + int reg, int shift, + int width, int value) +{ + unsigned long flags; + unsigned long tmp; + + raw_spin_lock_irqsave(&intc_irqpin_lock, flags); + + tmp = intc_irqpin_read(p, reg); + tmp &= ~(((1 << width) - 1) << shift); + tmp |= value << shift; + intc_irqpin_write(p, reg, tmp); + + raw_spin_unlock_irqrestore(&intc_irqpin_lock, flags); +} + +static void intc_irqpin_mask_unmask_prio(struct intc_irqpin_priv *p, + int irq, int do_mask) +{ + int bitfield_width = 4; /* PRIO assumed to have fixed bitfield width */ + int shift = (7 - irq) * bitfield_width; /* PRIO assumed to be 32-bit */ + + intc_irqpin_read_modify_write(p, INTC_IRQPIN_REG_PRIO, + shift, bitfield_width, + do_mask ? 0 : (1 << bitfield_width) - 1); +} + +static int intc_irqpin_set_sense(struct intc_irqpin_priv *p, int irq, int value) +{ + int bitfield_width = p->config.sense_bitfield_width; + int shift = (7 - irq) * bitfield_width; /* SENSE assumed to be 32-bit */ + + dev_dbg(&p->pdev->dev, "sense irq = %d, mode = %d\n", irq, value); + + if (value >= (1 << bitfield_width)) + return -EINVAL; + + intc_irqpin_read_modify_write(p, INTC_IRQPIN_REG_SENSE, shift, + bitfield_width, value); + return 0; +} + +static void intc_irqpin_dbg(struct intc_irqpin_irq *i, char *str) +{ + dev_dbg(&i->p->pdev->dev, "%s (%d:%d:%d)\n", + str, i->irq, i->hw_irq, + irq_find_mapping(i->p->irq_domain, i->hw_irq)); +} + +static void intc_irqpin_irq_enable(struct irq_data *d) +{ + struct intc_irqpin_priv *p = irq_data_get_irq_chip_data(d); + int hw_irq = irqd_to_hwirq(d); + + intc_irqpin_dbg(&p->irq[hw_irq], "enable"); + intc_irqpin_irq_write_hwirq(p, INTC_IRQPIN_REG_CLEAR, hw_irq); +} + +static void intc_irqpin_irq_disable(struct irq_data *d) +{ + struct intc_irqpin_priv *p = irq_data_get_irq_chip_data(d); + int hw_irq = irqd_to_hwirq(d); + + intc_irqpin_dbg(&p->irq[hw_irq], "disable"); + intc_irqpin_irq_write_hwirq(p, INTC_IRQPIN_REG_MASK, hw_irq); +} + +static void intc_irqpin_irq_enable_force(struct irq_data *d) +{ + struct intc_irqpin_priv *p = irq_data_get_irq_chip_data(d); + int irq = p->irq[irqd_to_hwirq(d)].irq; + + intc_irqpin_irq_enable(d); + irq_get_chip(irq)->irq_unmask(irq_get_irq_data(irq)); +} + +static void intc_irqpin_irq_disable_force(struct irq_data *d) +{ + struct intc_irqpin_priv *p = irq_data_get_irq_chip_data(d); + int irq = p->irq[irqd_to_hwirq(d)].irq; + + irq_get_chip(irq)->irq_mask(irq_get_irq_data(irq)); + intc_irqpin_irq_disable(d); +} + +#define INTC_IRQ_SENSE_VALID 0x10 +#define INTC_IRQ_SENSE(x) (x + INTC_IRQ_SENSE_VALID) + +static unsigned char intc_irqpin_sense[IRQ_TYPE_SENSE_MASK + 1] = { + [IRQ_TYPE_EDGE_FALLING] = INTC_IRQ_SENSE(0x00), + 
[IRQ_TYPE_EDGE_RISING] = INTC_IRQ_SENSE(0x01), + [IRQ_TYPE_LEVEL_LOW] = INTC_IRQ_SENSE(0x02), + [IRQ_TYPE_LEVEL_HIGH] = INTC_IRQ_SENSE(0x03), + [IRQ_TYPE_EDGE_BOTH] = INTC_IRQ_SENSE(0x04), +}; + +static int intc_irqpin_irq_set_type(struct irq_data *d, unsigned int type) +{ + unsigned char value = intc_irqpin_sense[type & IRQ_TYPE_SENSE_MASK]; + struct intc_irqpin_priv *p = irq_data_get_irq_chip_data(d); + + if (!(value & INTC_IRQ_SENSE_VALID)) + return -EINVAL; + + return intc_irqpin_set_sense(p, irqd_to_hwirq(d), + value ^ INTC_IRQ_SENSE_VALID); +} + +static irqreturn_t intc_irqpin_irq_handler(int irq, void *dev_id) +{ + struct intc_irqpin_irq *i = dev_id; + struct intc_irqpin_priv *p = i->p; + unsigned long bit; + + intc_irqpin_dbg(i, "demux1"); + bit = intc_irqpin_hwirq_mask(p, INTC_IRQPIN_REG_SOURCE, i->hw_irq); + + if (intc_irqpin_read(p, INTC_IRQPIN_REG_SOURCE) & bit) { + intc_irqpin_write(p, INTC_IRQPIN_REG_SOURCE, ~bit); + intc_irqpin_dbg(i, "demux2"); + generic_handle_irq(irq_find_mapping(p->irq_domain, i->hw_irq)); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static int intc_irqpin_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + struct intc_irqpin_priv *p = h->host_data; + + intc_irqpin_dbg(&p->irq[hw], "map"); + irq_set_chip_data(virq, h->host_data); + irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq); + set_irq_flags(virq, IRQF_VALID); /* kill me now */ + return 0; +} + +static struct irq_domain_ops intc_irqpin_irq_domain_ops = { + .map = intc_irqpin_irq_domain_map, +}; + +static int intc_irqpin_probe(struct platform_device *pdev) +{ + struct renesas_intc_irqpin_config *pdata = pdev->dev.platform_data; + struct intc_irqpin_priv *p; + struct intc_irqpin_iomem *i; + struct resource *io[INTC_IRQPIN_REG_NR]; + struct resource *irq; + struct irq_chip *irq_chip; + void (*enable_fn)(struct irq_data *d); + void (*disable_fn)(struct irq_data *d); + const char *name = dev_name(&pdev->dev); + int ret; + int k; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + ret = -ENOMEM; + goto err0; + } + + /* deal with driver instance configuration */ + if (pdata) + memcpy(&p->config, pdata, sizeof(*pdata)); + if (!p->config.sense_bitfield_width) + p->config.sense_bitfield_width = 4; /* default to 4 bits */ + + p->pdev = pdev; + platform_set_drvdata(pdev, p); + + /* get hold of manadatory IOMEM */ + for (k = 0; k < INTC_IRQPIN_REG_NR; k++) { + io[k] = platform_get_resource(pdev, IORESOURCE_MEM, k); + if (!io[k]) { + dev_err(&pdev->dev, "not enough IOMEM resources\n"); + ret = -EINVAL; + goto err1; + } + } + + /* allow any number of IRQs between 1 and INTC_IRQPIN_MAX */ + for (k = 0; k < INTC_IRQPIN_MAX; k++) { + irq = platform_get_resource(pdev, IORESOURCE_IRQ, k); + if (!irq) + break; + + p->irq[k].hw_irq = k; + p->irq[k].p = p; + p->irq[k].irq = irq->start; + } + + p->number_of_irqs = k; + if (p->number_of_irqs < 1) { + dev_err(&pdev->dev, "not enough IRQ resources\n"); + ret = -EINVAL; + goto err1; + } + + /* ioremap IOMEM and setup read/write callbacks */ + for (k = 0; k < INTC_IRQPIN_REG_NR; k++) { + i = &p->iomem[k]; + + switch (resource_size(io[k])) { + case 1: + i->width = 8; + i->read = intc_irqpin_read8; + i->write = intc_irqpin_write8; + break; + case 4: + i->width = 32; + i->read = intc_irqpin_read32; + i->write = intc_irqpin_write32; + break; + default: + dev_err(&pdev->dev, "IOMEM size mismatch\n"); + ret = -EINVAL; + goto err2; + } + + i->iomem = 
ioremap_nocache(io[k]->start, resource_size(io[k])); + if (!i->iomem) { + dev_err(&pdev->dev, "failed to remap IOMEM\n"); + ret = -ENXIO; + goto err2; + } + } + + /* mask all interrupts using priority */ + for (k = 0; k < p->number_of_irqs; k++) + intc_irqpin_mask_unmask_prio(p, k, 1); + + /* use more severe masking method if requested */ + if (p->config.control_parent) { + enable_fn = intc_irqpin_irq_enable_force; + disable_fn = intc_irqpin_irq_disable_force; + } else { + enable_fn = intc_irqpin_irq_enable; + disable_fn = intc_irqpin_irq_disable; + } + + irq_chip = &p->irq_chip; + irq_chip->name = name; + irq_chip->irq_mask = disable_fn; + irq_chip->irq_unmask = enable_fn; + irq_chip->irq_enable = enable_fn; + irq_chip->irq_disable = disable_fn; + irq_chip->irq_set_type = intc_irqpin_irq_set_type; + irq_chip->flags = IRQCHIP_SKIP_SET_WAKE; + + p->irq_domain = irq_domain_add_simple(pdev->dev.of_node, + p->number_of_irqs, + p->config.irq_base, + &intc_irqpin_irq_domain_ops, p); + if (!p->irq_domain) { + ret = -ENXIO; + dev_err(&pdev->dev, "cannot initialize irq domain\n"); + goto err2; + } + + /* request and set priority on interrupts one by one */ + for (k = 0; k < p->number_of_irqs; k++) { + if (request_irq(p->irq[k].irq, intc_irqpin_irq_handler, + 0, name, &p->irq[k])) { + dev_err(&pdev->dev, "failed to request low IRQ\n"); + ret = -ENOENT; + goto err3; + } + intc_irqpin_mask_unmask_prio(p, k, 0); + } + + dev_info(&pdev->dev, "driving %d irqs\n", p->number_of_irqs); + + /* warn in case of mismatch if irq base is specified */ + if (p->config.irq_base) { + k = irq_find_mapping(p->irq_domain, 0); + if (p->config.irq_base != k) + dev_warn(&pdev->dev, "irq base mismatch (%d/%d)\n", + p->config.irq_base, k); + } + + return 0; + +err3: + for (; k >= 0; k--) + free_irq(p->irq[k - 1].irq, &p->irq[k - 1]); + + irq_domain_remove(p->irq_domain); +err2: + for (k = 0; k < INTC_IRQPIN_REG_NR; k++) + iounmap(p->iomem[k].iomem); +err1: + kfree(p); +err0: + return ret; +} + +static int intc_irqpin_remove(struct platform_device *pdev) +{ + struct intc_irqpin_priv *p = platform_get_drvdata(pdev); + int k; + + for (k = 0; k < p->number_of_irqs; k++) + free_irq(p->irq[k].irq, &p->irq[k]); + + irq_domain_remove(p->irq_domain); + + for (k = 0; k < INTC_IRQPIN_REG_NR; k++) + iounmap(p->iomem[k].iomem); + + kfree(p); + return 0; +} + +static struct platform_driver intc_irqpin_device_driver = { + .probe = intc_irqpin_probe, + .remove = intc_irqpin_remove, + .driver = { + .name = "renesas_intc_irqpin", + } +}; + +static int __init intc_irqpin_init(void) +{ + return platform_driver_register(&intc_irqpin_device_driver); +} +postcore_initcall(intc_irqpin_init); + +static void __exit intc_irqpin_exit(void) +{ + platform_driver_unregister(&intc_irqpin_device_driver); +} +module_exit(intc_irqpin_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("Renesas INTC External IRQ Pin Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/irq-renesas-intc-irqpin.h b/include/linux/platform_data/irq-renesas-intc-irqpin.h new file mode 100644 index 000000000000..00ccac34dac8 --- /dev/null +++ b/include/linux/platform_data/irq-renesas-intc-irqpin.h @@ -0,0 +1,10 @@ +#ifndef __IRQ_RENESAS_INTC_IRQPIN_H__ +#define __IRQ_RENESAS_INTC_IRQPIN_H__ + +struct renesas_intc_irqpin_config { + unsigned int sense_bitfield_width; + unsigned int irq_base; + bool control_parent; +}; + +#endif /* __IRQ_RENESAS_INTC_IRQPIN_H__ */ -- cgit From 0ca8712285e9e762ce4f5faf9f803b52e48c6837 Mon Sep 17 00:00:00 2001 From: Magnus 
Damm Date: Tue, 26 Feb 2013 20:59:23 +0900 Subject: irqchip: intc-irqpin: GPL header for platform data Add GPL header to the platform data include file. Signed-off-by: Magnus Damm Reviewed-by: Thomas Gleixner Tested-by: Guennadi Liakhovetski Signed-off-by: Simon Horman --- include/linux/platform_data/irq-renesas-intc-irqpin.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/platform_data/irq-renesas-intc-irqpin.h b/include/linux/platform_data/irq-renesas-intc-irqpin.h index 00ccac34dac8..e4cb911066a6 100644 --- a/include/linux/platform_data/irq-renesas-intc-irqpin.h +++ b/include/linux/platform_data/irq-renesas-intc-irqpin.h @@ -1,3 +1,22 @@ +/* + * Renesas INTC External IRQ Pin Driver + * + * Copyright (C) 2013 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + #ifndef __IRQ_RENESAS_INTC_IRQPIN_H__ #define __IRQ_RENESAS_INTC_IRQPIN_H__ -- cgit From fbc83b7f59dd8ed1154286b6de00b6d03c24a3c4 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 27 Feb 2013 17:15:01 +0900 Subject: irqchip: Renesas IRQC driver This patch adds a driver for external IRQ pins connected to the IRQC hardware block on recent SoCs from Renesas. The IRQC hardware block is used together with more recent ARM based SoCs using the GIC. As usual, the GIC requires external IRQ trigger setup somewhere else, which in this particular case happens to be the IRQC. This driver implements the glue code needed to configure IRQ triggers and also to handle mask/unmask and demux of external IRQ pins hooked up from the IRQC to the GIC. Tested on r8a73a4, but designed to work with a wide range of SoCs. The driver requires one GIC SPI per external IRQ pin to operate. Each driver instance will handle up to 32 external IRQ pins. The SoCs using this driver are currently mainly used together with regular platform devices, so this driver allows configuration via platform data to support things such as a static interrupt base address. DT support will be added incrementally in the not so distant future.
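For illustration, a minimal board-code sketch of the platform-data path described above; the register base, SPI numbers and the gic_spi() offset are assumptions in the style of shmobile board files, not taken from this patch:

#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/platform_data/irq-renesas-irqc.h>

#define gic_spi(nr) ((nr) + 32)	/* assumed GIC SPI offset helper */

static const struct renesas_irqc_config irqc0_data = {
	.irq_base = 0,			/* 0: let the irq domain pick virqs */
};

static const struct resource irqc0_resources[] = {
	DEFINE_RES_MEM(0xe61c0000, 0x200),	/* assumed register block */
	DEFINE_RES_IRQ(gic_spi(0)),		/* one GIC SPI per IRQ pin */
	DEFINE_RES_IRQ(gic_spi(1)),
};

static void __init board_add_irqc0(void)
{
	platform_device_register_resndata(&platform_bus, "renesas_irqc", 0,
					  irqc0_resources,
					  ARRAY_SIZE(irqc0_resources),
					  &irqc0_data, sizeof(irqc0_data));
}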
Signed-off-by: Magnus Damm Tested-by: Guennadi Liakhovetski Signed-off-by: Simon Horman --- drivers/irqchip/Kconfig | 4 + drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-renesas-irqc.c | 298 +++++++++++++++++++++++++ include/linux/platform_data/irq-renesas-irqc.h | 27 +++ 4 files changed, 330 insertions(+) create mode 100644 drivers/irqchip/irq-renesas-irqc.c create mode 100644 include/linux/platform_data/irq-renesas-irqc.h (limited to 'include/linux') diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig index 0f5f1c3825bc..4a33351c25dc 100644 --- a/drivers/irqchip/Kconfig +++ b/drivers/irqchip/Kconfig @@ -29,6 +29,10 @@ config RENESAS_INTC_IRQPIN bool select IRQ_DOMAIN +config RENESAS_IRQC + bool + select IRQ_DOMAIN + config VERSATILE_FPGA_IRQ bool select IRQ_DOMAIN diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 1aaa4073ab60..e41ceb9bec22 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -9,4 +9,5 @@ obj-$(CONFIG_ARCH_SPEAR3XX) += spear-shirq.o obj-$(CONFIG_ARM_GIC) += irq-gic.o obj-$(CONFIG_ARM_VIC) += irq-vic.o obj-$(CONFIG_RENESAS_INTC_IRQPIN) += irq-renesas-intc-irqpin.o +obj-$(CONFIG_RENESAS_IRQC) += irq-renesas-irqc.o obj-$(CONFIG_VERSATILE_FPGA_IRQ) += irq-versatile-fpga.o diff --git a/drivers/irqchip/irq-renesas-irqc.c b/drivers/irqchip/irq-renesas-irqc.c new file mode 100644 index 000000000000..95d69bfac982 --- /dev/null +++ b/drivers/irqchip/irq-renesas-irqc.c @@ -0,0 +1,298 @@ +/* + * Renesas IRQC Driver + * + * Copyright (C) 2013 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IRQC_IRQ_MAX 32 /* maximum 32 interrupts per driver instance */ + +#define IRQC_REQ_STS 0x00 +#define IRQC_EN_STS 0x04 +#define IRQC_EN_SET 0x08 +#define IRQC_INT_CPU_BASE(n) (0x000 + ((n) * 0x10)) +#define DETECT_STATUS 0x100 +#define IRQC_CONFIG(n) (0x180 + ((n) * 0x04)) + +struct irqc_irq { + int hw_irq; + int requested_irq; + int domain_irq; + struct irqc_priv *p; +}; + +struct irqc_priv { + void __iomem *iomem; + void __iomem *cpu_int_base; + struct irqc_irq irq[IRQC_IRQ_MAX]; + struct renesas_irqc_config config; + unsigned int number_of_irqs; + struct platform_device *pdev; + struct irq_chip irq_chip; + struct irq_domain *irq_domain; +}; + +static void irqc_dbg(struct irqc_irq *i, char *str) +{ + dev_dbg(&i->p->pdev->dev, "%s (%d:%d:%d)\n", + str, i->requested_irq, i->hw_irq, i->domain_irq); +} + +static void irqc_irq_enable(struct irq_data *d) +{ + struct irqc_priv *p = irq_data_get_irq_chip_data(d); + int hw_irq = irqd_to_hwirq(d); + + irqc_dbg(&p->irq[hw_irq], "enable"); + iowrite32(BIT(hw_irq), p->cpu_int_base + IRQC_EN_SET); +} + +static void irqc_irq_disable(struct irq_data *d) +{ + struct irqc_priv *p = irq_data_get_irq_chip_data(d); + int hw_irq = irqd_to_hwirq(d); + + irqc_dbg(&p->irq[hw_irq], "disable"); + iowrite32(BIT(hw_irq), p->cpu_int_base + IRQC_EN_STS); +} + +#define INTC_IRQ_SENSE_VALID 0x10 +#define INTC_IRQ_SENSE(x) (x + INTC_IRQ_SENSE_VALID) + +static unsigned char irqc_sense[IRQ_TYPE_SENSE_MASK + 1] = { + [IRQ_TYPE_LEVEL_LOW] = INTC_IRQ_SENSE(0x01), + [IRQ_TYPE_LEVEL_HIGH] = INTC_IRQ_SENSE(0x02), + [IRQ_TYPE_EDGE_FALLING] = INTC_IRQ_SENSE(0x04), /* Synchronous */ + [IRQ_TYPE_EDGE_RISING] = INTC_IRQ_SENSE(0x08), /* Synchronous */ + [IRQ_TYPE_EDGE_BOTH] = INTC_IRQ_SENSE(0x0c), /* Synchronous */ +}; + +static int irqc_irq_set_type(struct irq_data *d, unsigned int type) +{ + struct irqc_priv *p = irq_data_get_irq_chip_data(d); + int hw_irq = irqd_to_hwirq(d); + unsigned char value = irqc_sense[type & IRQ_TYPE_SENSE_MASK]; + unsigned long tmp; + + irqc_dbg(&p->irq[hw_irq], "sense"); + + if (!(value & INTC_IRQ_SENSE_VALID)) + return -EINVAL; + + tmp = ioread32(p->iomem + IRQC_CONFIG(hw_irq)); + tmp &= ~0x3f; + tmp |= value ^ INTC_IRQ_SENSE_VALID; + iowrite32(tmp, p->iomem + IRQC_CONFIG(hw_irq)); + return 0; +} + +static irqreturn_t irqc_irq_handler(int irq, void *dev_id) +{ + struct irqc_irq *i = dev_id; + struct irqc_priv *p = i->p; + unsigned long bit = BIT(i->hw_irq); + + irqc_dbg(i, "demux1"); + + if (ioread32(p->iomem + DETECT_STATUS) & bit) { + iowrite32(bit, p->iomem + DETECT_STATUS); + irqc_dbg(i, "demux2"); + generic_handle_irq(i->domain_irq); + return IRQ_HANDLED; + } + return IRQ_NONE; +} + +static int irqc_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + struct irqc_priv *p = h->host_data; + + p->irq[hw].domain_irq = virq; + p->irq[hw].hw_irq = hw; + + irqc_dbg(&p->irq[hw], "map"); + irq_set_chip_data(virq, h->host_data); + irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq); + set_irq_flags(virq, IRQF_VALID); /* kill me now */ + return 0; +} + +static struct irq_domain_ops irqc_irq_domain_ops = { + .map = irqc_irq_domain_map, +}; + +static int irqc_probe(struct 
platform_device *pdev) +{ + struct renesas_irqc_config *pdata = pdev->dev.platform_data; + struct irqc_priv *p; + struct resource *io; + struct resource *irq; + struct irq_chip *irq_chip; + const char *name = dev_name(&pdev->dev); + int ret; + int k; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + ret = -ENOMEM; + goto err0; + } + + /* deal with driver instance configuration */ + if (pdata) + memcpy(&p->config, pdata, sizeof(*pdata)); + + p->pdev = pdev; + platform_set_drvdata(pdev, p); + + /* get hold of manadatory IOMEM */ + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!io) { + dev_err(&pdev->dev, "not enough IOMEM resources\n"); + ret = -EINVAL; + goto err1; + } + + /* allow any number of IRQs between 1 and IRQC_IRQ_MAX */ + for (k = 0; k < IRQC_IRQ_MAX; k++) { + irq = platform_get_resource(pdev, IORESOURCE_IRQ, k); + if (!irq) + break; + + p->irq[k].p = p; + p->irq[k].requested_irq = irq->start; + } + + p->number_of_irqs = k; + if (p->number_of_irqs < 1) { + dev_err(&pdev->dev, "not enough IRQ resources\n"); + ret = -EINVAL; + goto err1; + } + + /* ioremap IOMEM and setup read/write callbacks */ + p->iomem = ioremap_nocache(io->start, resource_size(io)); + if (!p->iomem) { + dev_err(&pdev->dev, "failed to remap IOMEM\n"); + ret = -ENXIO; + goto err2; + } + + p->cpu_int_base = p->iomem + IRQC_INT_CPU_BASE(0); /* SYS-SPI */ + + irq_chip = &p->irq_chip; + irq_chip->name = name; + irq_chip->irq_mask = irqc_irq_disable; + irq_chip->irq_unmask = irqc_irq_enable; + irq_chip->irq_enable = irqc_irq_enable; + irq_chip->irq_disable = irqc_irq_disable; + irq_chip->irq_set_type = irqc_irq_set_type; + irq_chip->flags = IRQCHIP_SKIP_SET_WAKE; + + p->irq_domain = irq_domain_add_simple(pdev->dev.of_node, + p->number_of_irqs, + p->config.irq_base, + &irqc_irq_domain_ops, p); + if (!p->irq_domain) { + ret = -ENXIO; + dev_err(&pdev->dev, "cannot initialize irq domain\n"); + goto err2; + } + + /* request interrupts one by one */ + for (k = 0; k < p->number_of_irqs; k++) { + if (request_irq(p->irq[k].requested_irq, irqc_irq_handler, + 0, name, &p->irq[k])) { + dev_err(&pdev->dev, "failed to request IRQ\n"); + ret = -ENOENT; + goto err3; + } + } + + dev_info(&pdev->dev, "driving %d irqs\n", p->number_of_irqs); + + /* warn in case of mismatch if irq base is specified */ + if (p->config.irq_base) { + if (p->config.irq_base != p->irq[0].domain_irq) + dev_warn(&pdev->dev, "irq base mismatch (%d/%d)\n", + p->config.irq_base, p->irq[0].domain_irq); + } + + return 0; +err3: + for (; k >= 0; k--) + free_irq(p->irq[k - 1].requested_irq, &p->irq[k - 1]); + + irq_domain_remove(p->irq_domain); +err2: + iounmap(p->iomem); +err1: + kfree(p); +err0: + return ret; +} + +static int irqc_remove(struct platform_device *pdev) +{ + struct irqc_priv *p = platform_get_drvdata(pdev); + int k; + + for (k = 0; k < p->number_of_irqs; k++) + free_irq(p->irq[k].requested_irq, &p->irq[k]); + + irq_domain_remove(p->irq_domain); + iounmap(p->iomem); + kfree(p); + return 0; +} + +static struct platform_driver irqc_device_driver = { + .probe = irqc_probe, + .remove = irqc_remove, + .driver = { + .name = "renesas_irqc", + } +}; + +static int __init irqc_init(void) +{ + return platform_driver_register(&irqc_device_driver); +} +postcore_initcall(irqc_init); + +static void __exit irqc_exit(void) +{ + platform_driver_unregister(&irqc_device_driver); +} +module_exit(irqc_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("Renesas IRQC Driver"); 
+MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/irq-renesas-irqc.h b/include/linux/platform_data/irq-renesas-irqc.h new file mode 100644 index 000000000000..3ae17b3e00ed --- /dev/null +++ b/include/linux/platform_data/irq-renesas-irqc.h @@ -0,0 +1,27 @@ +/* + * Renesas IRQC Driver + * + * Copyright (C) 2013 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef __IRQ_RENESAS_IRQC_H__ +#define __IRQ_RENESAS_IRQC_H__ + +struct renesas_irqc_config { + unsigned int irq_base; +}; + +#endif /* __IRQ_RENESAS_IRQC_H__ */ -- cgit From af6882be363d3a7bf0f72dd17ac2a639c4da0059 Mon Sep 17 00:00:00 2001 From: Fabio Baltieri Date: Fri, 8 Mar 2013 10:27:09 +0800 Subject: usb: phy: ab8500-usb: update irq handling code Update irq handling code to notify all possible link status changes of AB8500 and AB8505 to the ux500-musb glue driver. The additional event codes will be used for the pm-runtime implementation, and are defined in a separate ux500-specific header. This also modifies the irq registration code to use devm_* helpers and drops all unnecessary fail path code. Acked-by: Linus Walleij Signed-off-by: Fabio Baltieri Signed-off-by: Felipe Balbi --- drivers/usb/musb/ux500.c | 7 +- drivers/usb/phy/phy-ab8500-usb.c | 440 ++++++++++++++++++++++++++++++--------- include/linux/usb/musb-ux500.h | 31 +++ 3 files changed, 382 insertions(+), 96 deletions(-) create mode 100644 include/linux/usb/musb-ux500.h (limited to 'include/linux') diff --git a/drivers/usb/musb/ux500.c b/drivers/usb/musb/ux500.c index 0ae9472a68a8..88795f532370 100644 --- a/drivers/usb/musb/ux500.c +++ b/drivers/usb/musb/ux500.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "musb_core.h" @@ -107,15 +108,15 @@ static int musb_otg_notifications(struct notifier_block *nb, event, usb_otg_state_string(musb->xceiv->state)); switch (event) { - case USB_EVENT_ID: + case UX500_MUSB_ID: dev_dbg(musb->controller, "ID GND\n"); ux500_musb_set_vbus(musb, 1); break; - case USB_EVENT_VBUS: + case UX500_MUSB_VBUS: dev_dbg(musb->controller, "VBUS Connect\n"); ux500_musb_set_vbus(musb, 0); break; - case USB_EVENT_NONE: + case UX500_MUSB_NONE: dev_dbg(musb->controller, "VBUS Disconnect\n"); if (is_host_active(musb)) ux500_musb_set_vbus(musb, 0); diff --git a/drivers/usb/phy/phy-ab8500-usb.c b/drivers/usb/phy/phy-ab8500-usb.c index 9f5e0e4ab02a..351b0369a611 100644 --- a/drivers/usb/phy/phy-ab8500-usb.c +++ b/drivers/usb/phy/phy-ab8500-usb.c @@ -31,9 +31,11 @@ #include #include #include +#include #define AB8500_MAIN_WD_CTRL_REG 0x01 #define AB8500_USB_LINE_STAT_REG 0x80 +#define AB8505_USB_LINE_STAT_REG 0x94 #define AB8500_USB_PHY_CTRL_REG 0x8A #define AB8500_BIT_OTG_STAT_ID (1 << 0) @@ -44,36 +46,76 @@ #define AB8500_WD_KICK_DELAY_US 100 /* usec */ #define AB8500_WD_V11_DISABLE_DELAY_US 100 /* usec */ +#define AB8500_V20_31952_DISABLE_DELAY_US 100 /* usec */ /* Usb line status register
*/ enum ab8500_usb_link_status { - USB_LINK_NOT_CONFIGURED = 0, - USB_LINK_STD_HOST_NC, - USB_LINK_STD_HOST_C_NS, - USB_LINK_STD_HOST_C_S, - USB_LINK_HOST_CHG_NM, - USB_LINK_HOST_CHG_HS, - USB_LINK_HOST_CHG_HS_CHIRP, - USB_LINK_DEDICATED_CHG, - USB_LINK_ACA_RID_A, - USB_LINK_ACA_RID_B, - USB_LINK_ACA_RID_C_NM, - USB_LINK_ACA_RID_C_HS, - USB_LINK_ACA_RID_C_HS_CHIRP, - USB_LINK_HM_IDGND, - USB_LINK_RESERVED, - USB_LINK_NOT_VALID_LINK + USB_LINK_NOT_CONFIGURED_8500 = 0, + USB_LINK_STD_HOST_NC_8500, + USB_LINK_STD_HOST_C_NS_8500, + USB_LINK_STD_HOST_C_S_8500, + USB_LINK_HOST_CHG_NM_8500, + USB_LINK_HOST_CHG_HS_8500, + USB_LINK_HOST_CHG_HS_CHIRP_8500, + USB_LINK_DEDICATED_CHG_8500, + USB_LINK_ACA_RID_A_8500, + USB_LINK_ACA_RID_B_8500, + USB_LINK_ACA_RID_C_NM_8500, + USB_LINK_ACA_RID_C_HS_8500, + USB_LINK_ACA_RID_C_HS_CHIRP_8500, + USB_LINK_HM_IDGND_8500, + USB_LINK_RESERVED_8500, + USB_LINK_NOT_VALID_LINK_8500, +}; + +enum ab8505_usb_link_status { + USB_LINK_NOT_CONFIGURED_8505 = 0, + USB_LINK_STD_HOST_NC_8505, + USB_LINK_STD_HOST_C_NS_8505, + USB_LINK_STD_HOST_C_S_8505, + USB_LINK_CDP_8505, + USB_LINK_RESERVED0_8505, + USB_LINK_RESERVED1_8505, + USB_LINK_DEDICATED_CHG_8505, + USB_LINK_ACA_RID_A_8505, + USB_LINK_ACA_RID_B_8505, + USB_LINK_ACA_RID_C_NM_8505, + USB_LINK_RESERVED2_8505, + USB_LINK_RESERVED3_8505, + USB_LINK_HM_IDGND_8505, + USB_LINK_CHARGERPORT_NOT_OK_8505, + USB_LINK_CHARGER_DM_HIGH_8505, + USB_LINK_PHYEN_NO_VBUS_NO_IDGND_8505, + USB_LINK_STD_UPSTREAM_NO_IDGNG_NO_VBUS_8505, + USB_LINK_STD_UPSTREAM_8505, + USB_LINK_CHARGER_SE1_8505, + USB_LINK_CARKIT_CHGR_1_8505, + USB_LINK_CARKIT_CHGR_2_8505, + USB_LINK_ACA_DOCK_CHGR_8505, + USB_LINK_SAMSUNG_BOOT_CBL_PHY_EN_8505, + USB_LINK_SAMSUNG_BOOT_CBL_PHY_DISB_8505, + USB_LINK_SAMSUNG_UART_CBL_PHY_EN_8505, + USB_LINK_SAMSUNG_UART_CBL_PHY_DISB_8505, + USB_LINK_MOTOROLA_FACTORY_CBL_PHY_EN_8505, +}; + +enum ab8500_usb_mode { + USB_IDLE = 0, + USB_PERIPHERAL, + USB_HOST, + USB_DEDICATED_CHG }; struct ab8500_usb { struct usb_phy phy; struct device *dev; struct ab8500 *ab8500; - int irq_num_link_status; unsigned vbus_draw; struct delayed_work dwork; struct work_struct phy_dis_work; unsigned long link_status_wait; + enum ab8500_usb_mode mode; + int previous_link_status_state; }; static inline struct ab8500_usb *phy_to_ab(struct usb_phy *x) @@ -104,6 +146,17 @@ static void ab8500_usb_wd_workaround(struct ab8500_usb *ab) 0); } +static void ab8500_usb_wd_linkstatus(struct ab8500_usb *ab, u8 bit) +{ + /* Workaround for v2.0 bug # 31952 */ + if (is_ab8500_2p0(ab->ab8500)) { + abx500_mask_and_set_register_interruptible(ab->dev, + AB8500_USB, AB8500_USB_PHY_CTRL_REG, + bit, bit); + udelay(AB8500_V20_31952_DISABLE_DELAY_US); + } +} + static void ab8500_usb_phy_ctrl(struct ab8500_usb *ab, bool sel_host, bool enable) { @@ -139,92 +192,276 @@ static void ab8500_usb_phy_ctrl(struct ab8500_usb *ab, bool sel_host, #define ab8500_usb_peri_phy_en(ab) ab8500_usb_phy_ctrl(ab, false, true) #define ab8500_usb_peri_phy_dis(ab) ab8500_usb_phy_ctrl(ab, false, false) -static int ab8500_usb_link_status_update(struct ab8500_usb *ab) +static int ab8505_usb_link_status_update(struct ab8500_usb *ab, + enum ab8505_usb_link_status lsts) { - u8 reg; - enum ab8500_usb_link_status lsts; - void *v = NULL; - enum usb_phy_events event; + enum ux500_musb_vbus_id_status event = 0; - abx500_get_register_interruptible(ab->dev, - AB8500_USB, - AB8500_USB_LINE_STAT_REG, - ®); + dev_dbg(ab->dev, "ab8505_usb_link_status_update %d\n", lsts); - lsts = (reg >> 3) & 0x0F; + /* + * Spurious 
link_status interrupts are seen at the time of + * disconnection of a device in RIDA state + */ + if (ab->previous_link_status_state == USB_LINK_ACA_RID_A_8505 && + (lsts == USB_LINK_STD_HOST_NC_8505)) + return 0; + + ab->previous_link_status_state = lsts; switch (lsts) { - case USB_LINK_NOT_CONFIGURED: - case USB_LINK_RESERVED: - case USB_LINK_NOT_VALID_LINK: - /* TODO: Disable regulators. */ - ab8500_usb_host_phy_dis(ab); - ab8500_usb_peri_phy_dis(ab); - ab->phy.state = OTG_STATE_B_IDLE; + case USB_LINK_ACA_RID_B_8505: + event = UX500_MUSB_RIDB; + case USB_LINK_NOT_CONFIGURED_8505: + case USB_LINK_RESERVED0_8505: + case USB_LINK_RESERVED1_8505: + case USB_LINK_RESERVED2_8505: + case USB_LINK_RESERVED3_8505: + ab->mode = USB_IDLE; ab->phy.otg->default_a = false; ab->vbus_draw = 0; - event = USB_EVENT_NONE; + if (event != UX500_MUSB_RIDB) + event = UX500_MUSB_NONE; + /* + * Fallback to default B_IDLE as nothing + * is connected + */ + ab->phy.state = OTG_STATE_B_IDLE; break; - case USB_LINK_STD_HOST_NC: - case USB_LINK_STD_HOST_C_NS: - case USB_LINK_STD_HOST_C_S: - case USB_LINK_HOST_CHG_NM: - case USB_LINK_HOST_CHG_HS: - case USB_LINK_HOST_CHG_HS_CHIRP: - if (ab->phy.otg->gadget) { - /* TODO: Enable regulators. */ + case USB_LINK_ACA_RID_C_NM_8505: + event = UX500_MUSB_RIDC; + case USB_LINK_STD_HOST_NC_8505: + case USB_LINK_STD_HOST_C_NS_8505: + case USB_LINK_STD_HOST_C_S_8505: + case USB_LINK_CDP_8505: + if (ab->mode == USB_IDLE) { + ab->mode = USB_PERIPHERAL; ab8500_usb_peri_phy_en(ab); - v = ab->phy.otg->gadget; + atomic_notifier_call_chain(&ab->phy.notifier, + UX500_MUSB_PREPARE, &ab->vbus_draw); } - event = USB_EVENT_VBUS; + if (event != UX500_MUSB_RIDC) + event = UX500_MUSB_VBUS; break; - case USB_LINK_HM_IDGND: - if (ab->phy.otg->host) { - /* TODO: Enable regulators. 
*/ + case USB_LINK_ACA_RID_A_8505: + case USB_LINK_ACA_DOCK_CHGR_8505: + event = UX500_MUSB_RIDA; + case USB_LINK_HM_IDGND_8505: + if (ab->mode == USB_IDLE) { + ab->mode = USB_HOST; ab8500_usb_host_phy_en(ab); - v = ab->phy.otg->host; + atomic_notifier_call_chain(&ab->phy.notifier, + UX500_MUSB_PREPARE, &ab->vbus_draw); } - ab->phy.state = OTG_STATE_A_IDLE; ab->phy.otg->default_a = true; - event = USB_EVENT_ID; + if (event != UX500_MUSB_RIDA) + event = UX500_MUSB_ID; + atomic_notifier_call_chain(&ab->phy.notifier, + event, &ab->vbus_draw); break; - case USB_LINK_ACA_RID_A: - case USB_LINK_ACA_RID_B: - /* TODO */ - case USB_LINK_ACA_RID_C_NM: - case USB_LINK_ACA_RID_C_HS: - case USB_LINK_ACA_RID_C_HS_CHIRP: - case USB_LINK_DEDICATED_CHG: - /* TODO: vbus_draw */ - event = USB_EVENT_CHARGER; + case USB_LINK_DEDICATED_CHG_8505: + ab->mode = USB_DEDICATED_CHG; + event = UX500_MUSB_CHARGER; + atomic_notifier_call_chain(&ab->phy.notifier, + event, &ab->vbus_draw); + break; + + default: break; } - atomic_notifier_call_chain(&ab->phy.notifier, event, v); + return 0; +} + +static int ab8500_usb_link_status_update(struct ab8500_usb *ab, + enum ab8500_usb_link_status lsts) +{ + enum ux500_musb_vbus_id_status event = 0; + + dev_dbg(ab->dev, "ab8500_usb_link_status_update %d\n", lsts); + + /* + * Spurious link_status interrupts are seen in case of a + * disconnection of a device in IDGND and RIDA stage + */ + if (ab->previous_link_status_state == USB_LINK_HM_IDGND_8500 && + (lsts == USB_LINK_STD_HOST_C_NS_8500 || + lsts == USB_LINK_STD_HOST_NC_8500)) + return 0; + + if (ab->previous_link_status_state == USB_LINK_ACA_RID_A_8500 && + lsts == USB_LINK_STD_HOST_NC_8500) + return 0; + + ab->previous_link_status_state = lsts; + + switch (lsts) { + case USB_LINK_ACA_RID_B_8500: + event = UX500_MUSB_RIDB; + case USB_LINK_NOT_CONFIGURED_8500: + case USB_LINK_NOT_VALID_LINK_8500: + ab->mode = USB_IDLE; + ab->phy.otg->default_a = false; + ab->vbus_draw = 0; + if (event != UX500_MUSB_RIDB) + event = UX500_MUSB_NONE; + /* Fallback to default B_IDLE as nothing is connected */ + ab->phy.state = OTG_STATE_B_IDLE; + break; + + case USB_LINK_ACA_RID_C_NM_8500: + case USB_LINK_ACA_RID_C_HS_8500: + case USB_LINK_ACA_RID_C_HS_CHIRP_8500: + event = UX500_MUSB_RIDC; + case USB_LINK_STD_HOST_NC_8500: + case USB_LINK_STD_HOST_C_NS_8500: + case USB_LINK_STD_HOST_C_S_8500: + case USB_LINK_HOST_CHG_NM_8500: + case USB_LINK_HOST_CHG_HS_8500: + case USB_LINK_HOST_CHG_HS_CHIRP_8500: + if (ab->mode == USB_IDLE) { + ab->mode = USB_PERIPHERAL; + ab8500_usb_peri_phy_en(ab); + atomic_notifier_call_chain(&ab->phy.notifier, + UX500_MUSB_PREPARE, &ab->vbus_draw); + } + if (event != UX500_MUSB_RIDC) + event = UX500_MUSB_VBUS; + break; + + case USB_LINK_ACA_RID_A_8500: + event = UX500_MUSB_RIDA; + case USB_LINK_HM_IDGND_8500: + if (ab->mode == USB_IDLE) { + ab->mode = USB_HOST; + ab8500_usb_host_phy_en(ab); + atomic_notifier_call_chain(&ab->phy.notifier, + UX500_MUSB_PREPARE, &ab->vbus_draw); + } + ab->phy.otg->default_a = true; + if (event != UX500_MUSB_RIDA) + event = UX500_MUSB_ID; + atomic_notifier_call_chain(&ab->phy.notifier, + event, &ab->vbus_draw); + break; + + case USB_LINK_DEDICATED_CHG_8500: + ab->mode = USB_DEDICATED_CHG; + event = UX500_MUSB_CHARGER; + atomic_notifier_call_chain(&ab->phy.notifier, + event, &ab->vbus_draw); + break; + + case USB_LINK_RESERVED_8500: + break; + } return 0; } -static void ab8500_usb_delayed_work(struct work_struct *work) +/* + * Connection Sequence: + * 1. Link Status Interrupt + * 2. 
Enable AB clock + * 3. Enable AB regulators + * 4. Enable USB phy + * 5. Reset the musb controller + * 6. Switch the ULPI GPIO pins to fucntion mode + * 7. Enable the musb Peripheral5 clock + * 8. Restore MUSB context + */ +static int abx500_usb_link_status_update(struct ab8500_usb *ab) { - struct ab8500_usb *ab = container_of(work, struct ab8500_usb, - dwork.work); + u8 reg; + int ret = 0; + + if (is_ab8500(ab->ab8500)) { + enum ab8500_usb_link_status lsts; + + abx500_get_register_interruptible(ab->dev, + AB8500_USB, AB8500_USB_LINE_STAT_REG, ®); + lsts = (reg >> 3) & 0x0F; + ret = ab8500_usb_link_status_update(ab, lsts); + } else if (is_ab8505(ab->ab8500)) { + enum ab8505_usb_link_status lsts; + + abx500_get_register_interruptible(ab->dev, + AB8500_USB, AB8505_USB_LINE_STAT_REG, ®); + lsts = (reg >> 3) & 0x1F; + ret = ab8505_usb_link_status_update(ab, lsts); + } + + return ret; +} + +/* + * Disconnection Sequence: + * 1. Disconect Interrupt + * 2. Disable regulators + * 3. Disable AB clock + * 4. Disable the Phy + * 5. Link Status Interrupt + * 6. Disable Musb Clock + */ +static irqreturn_t ab8500_usb_disconnect_irq(int irq, void *data) +{ + struct ab8500_usb *ab = (struct ab8500_usb *) data; + enum usb_phy_events event = UX500_MUSB_NONE; + + /* Link status will not be updated till phy is disabled. */ + if (ab->mode == USB_HOST) { + ab->phy.otg->default_a = false; + ab->vbus_draw = 0; + atomic_notifier_call_chain(&ab->phy.notifier, + event, &ab->vbus_draw); + ab8500_usb_host_phy_dis(ab); + ab->mode = USB_IDLE; + } + + if (ab->mode == USB_PERIPHERAL) { + atomic_notifier_call_chain(&ab->phy.notifier, + event, &ab->vbus_draw); + ab8500_usb_peri_phy_dis(ab); + atomic_notifier_call_chain(&ab->phy.notifier, + UX500_MUSB_CLEAN, &ab->vbus_draw); + ab->mode = USB_IDLE; + ab->phy.otg->default_a = false; + ab->vbus_draw = 0; + } + + if (is_ab8500_2p0(ab->ab8500)) { + if (ab->mode == USB_DEDICATED_CHG) { + ab8500_usb_wd_linkstatus(ab, + AB8500_BIT_PHY_CTRL_DEVICE_EN); + abx500_mask_and_set_register_interruptible(ab->dev, + AB8500_USB, AB8500_USB_PHY_CTRL_REG, + AB8500_BIT_PHY_CTRL_DEVICE_EN, 0); + } + } - ab8500_usb_link_status_update(ab); + return IRQ_HANDLED; } -static irqreturn_t ab8500_usb_v20_irq(int irq, void *data) +static irqreturn_t ab8500_usb_link_status_irq(int irq, void *data) { struct ab8500_usb *ab = (struct ab8500_usb *) data; - ab8500_usb_link_status_update(ab); + abx500_usb_link_status_update(ab); return IRQ_HANDLED; } +static void ab8500_usb_delayed_work(struct work_struct *work) +{ + struct ab8500_usb *ab = container_of(work, struct ab8500_usb, + dwork.work); + + abx500_usb_link_status_update(ab); +} + static void ab8500_usb_phy_disable_work(struct work_struct *work) { struct ab8500_usb *ab = container_of(work, struct ab8500_usb, @@ -250,7 +487,7 @@ static int ab8500_usb_set_power(struct usb_phy *phy, unsigned mA) if (mA) atomic_notifier_call_chain(&ab->phy.notifier, - USB_EVENT_ENUMERATED, ab->phy.otg->gadget); + UX500_MUSB_ENUMERATED, ab->phy.otg->gadget); return 0; } @@ -327,30 +564,48 @@ static int ab8500_usb_set_host(struct usb_otg *otg, struct usb_bus *host) return 0; } -static void ab8500_usb_irq_free(struct ab8500_usb *ab) -{ - free_irq(ab->irq_num_link_status, ab); -} - -static int ab8500_usb_v2_res_setup(struct platform_device *pdev, - struct ab8500_usb *ab) +static int ab8500_usb_irq_setup(struct platform_device *pdev, + struct ab8500_usb *ab) { int err; + int irq; - ab->irq_num_link_status = platform_get_irq_byname(pdev, - "USB_LINK_STATUS"); - if 
(ab->irq_num_link_status < 0) { + irq = platform_get_irq_byname(pdev, "USB_LINK_STATUS"); + if (irq < 0) { dev_err(&pdev->dev, "Link status irq not found\n"); - return ab->irq_num_link_status; + return irq; + } + err = devm_request_threaded_irq(&pdev->dev, irq, NULL, + ab8500_usb_link_status_irq, + IRQF_NO_SUSPEND | IRQF_SHARED, "usb-link-status", ab); + if (err < 0) { + dev_err(ab->dev, "request_irq failed for link status irq\n"); + return err; } - err = request_threaded_irq(ab->irq_num_link_status, NULL, - ab8500_usb_v20_irq, - IRQF_NO_SUSPEND | IRQF_SHARED, - "usb-link-status", ab); + irq = platform_get_irq_byname(pdev, "ID_WAKEUP_F"); + if (irq < 0) { + dev_err(&pdev->dev, "ID fall irq not found\n"); + return irq; + } + err = devm_request_threaded_irq(&pdev->dev, irq, NULL, + ab8500_usb_disconnect_irq, + IRQF_NO_SUSPEND | IRQF_SHARED, "usb-id-fall", ab); if (err < 0) { - dev_err(ab->dev, - "request_irq failed for link status irq\n"); + dev_err(ab->dev, "request_irq failed for ID fall irq\n"); + return err; + } + + irq = platform_get_irq_byname(pdev, "VBUS_DET_F"); + if (irq < 0) { + dev_err(&pdev->dev, "VBUS fall irq not found\n"); + return irq; + } + err = devm_request_threaded_irq(&pdev->dev, irq, NULL, + ab8500_usb_disconnect_irq, + IRQF_NO_SUSPEND | IRQF_SHARED, "usb-vbus-fall", ab); + if (err < 0) { + dev_err(ab->dev, "request_irq failed for Vbus fall irq\n"); return err; } @@ -408,22 +663,23 @@ static int ab8500_usb_probe(struct platform_device *pdev) /* all: Disable phy when called from set_host and set_peripheral */ INIT_WORK(&ab->phy_dis_work, ab8500_usb_phy_disable_work); - err = ab8500_usb_v2_res_setup(pdev, ab); + err = ab8500_usb_irq_setup(pdev, ab); if (err < 0) - goto fail0; + goto fail; err = usb_add_phy(&ab->phy, USB_PHY_TYPE_USB2); if (err) { dev_err(&pdev->dev, "Can't register transceiver\n"); - goto fail1; + goto fail; } + /* Needed to enable ID detection. */ + ab8500_usb_wd_workaround(ab); + dev_info(&pdev->dev, "revision 0x%2x driver initialized\n", rev); return 0; -fail1: - ab8500_usb_irq_free(ab); -fail0: +fail: kfree(otg); kfree(ab); return err; @@ -433,8 +689,6 @@ static int ab8500_usb_remove(struct platform_device *pdev) { struct ab8500_usb *ab = platform_get_drvdata(pdev); - ab8500_usb_irq_free(ab); - cancel_delayed_work_sync(&ab->dwork); cancel_work_sync(&ab->phy_dis_work); diff --git a/include/linux/usb/musb-ux500.h b/include/linux/usb/musb-ux500.h new file mode 100644 index 000000000000..1e2c7130f6e1 --- /dev/null +++ b/include/linux/usb/musb-ux500.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2013 ST-Ericsson AB + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __MUSB_UX500_H__ +#define __MUSB_UX500_H__ + +enum ux500_musb_vbus_id_status { + UX500_MUSB_NONE = 0, + UX500_MUSB_VBUS, + UX500_MUSB_ID, + UX500_MUSB_CHARGER, + UX500_MUSB_ENUMERATED, + UX500_MUSB_RIDA, + UX500_MUSB_RIDB, + UX500_MUSB_RIDC, + UX500_MUSB_PREPARE, + UX500_MUSB_CLEAN, +}; + +#endif /* __MUSB_UX500_H__ */ -- cgit From 70bc126471af30bb115e635512dcf6d86fe6e29a Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Wed, 6 Mar 2013 08:20:52 -0500 Subject: tty: Add safe tty throttle/unthrottle functions The tty driver can become stuck throttled due to race conditions between throttle and unthrottle, when the decision to throttle or unthrottle is conditional. The following example helps to illustrate the race:

	CPU 0                  |  CPU 1
	                       |
	if (condition A)       |
	                       |  if (!condition A)
	                       |          unthrottle()
	    throttle()         |
	                       |

Note the converse is also possible; i.e.,

	CPU 0                  |  CPU 1
	                       |
	                       |  if (!condition A)
	if (condition A)       |
	    throttle()         |
	                       |          unthrottle()
	                       |

Add new throttle/unthrottle functions based on the familiar model of task state and schedule/wake. For example,

	while (1) {
		tty_set_flow_change(tty, TTY_THROTTLE_SAFE);
		if (!condition)
			break;
		if (!tty_throttle_safe(tty))
			break;
	}
	__tty_set_flow_change(tty, 0);

In this example, if an unthrottle occurs after the condition is evaluated but before tty_throttle_safe(), then tty_throttle_safe() will return non-zero, looping and forcing the re-evaluation of condition. Reported-by: Vincent Pillet Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_ioctl.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/tty.h | 18 ++++++++++++++ 2 files changed, 82 insertions(+) (limited to 'include/linux') diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index d58b92cc187c..132d452578bb 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -106,6 +106,7 @@ void tty_throttle(struct tty_struct *tty) if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) && tty->ops->throttle) tty->ops->throttle(tty); + tty->flow_change = 0; mutex_unlock(&tty->termios_mutex); } EXPORT_SYMBOL(tty_throttle); @@ -129,10 +130,73 @@ void tty_unthrottle(struct tty_struct *tty) if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) && tty->ops->unthrottle) tty->ops->unthrottle(tty); + tty->flow_change = 0; mutex_unlock(&tty->termios_mutex); } EXPORT_SYMBOL(tty_unthrottle); +/** + * tty_throttle_safe - flow control + * @tty: terminal + * + * Similar to tty_throttle() but will only attempt throttle + * if tty->flow_change is TTY_THROTTLE_SAFE. Prevents an accidental + * throttle due to race conditions when throttling is conditional + * on factors evaluated prior to throttling. + * + * Returns 0 if tty is throttled (or was already throttled) + */ + +int tty_throttle_safe(struct tty_struct *tty) +{ + int ret = 0; + + mutex_lock(&tty->termios_mutex); + if (!test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty->flow_change != TTY_THROTTLE_SAFE) + ret = 1; + else { + __set_bit(TTY_THROTTLED, &tty->flags); + if (tty->ops->throttle) + tty->ops->throttle(tty); + } + } + mutex_unlock(&tty->termios_mutex); + + return ret; +} + +/** + * tty_unthrottle_safe - flow control + * @tty: terminal + * + * Similar to tty_unthrottle() but will only attempt unthrottle + * if tty->flow_change is TTY_UNTHROTTLE_SAFE. Prevents an accidental + * unthrottle due to race conditions when unthrottling is conditional + * on factors evaluated prior to unthrottling.
+ * + * Returns 0 if tty is unthrottled (or was already unthrottled) + */ + +int tty_unthrottle_safe(struct tty_struct *tty) +{ + int ret = 0; + + mutex_lock(&tty->termios_mutex); + if (test_bit(TTY_THROTTLED, &tty->flags)) { + if (tty->flow_change != TTY_UNTHROTTLE_SAFE) + ret = 1; + else { + __clear_bit(TTY_THROTTLED, &tty->flags); + if (tty->ops->unthrottle) + tty->ops->unthrottle(tty); + } + } + mutex_unlock(&tty->termios_mutex); + + return ret; +} + /** * tty_wait_until_sent - wait for I/O to finish * @tty: tty we are waiting for diff --git a/include/linux/tty.h b/include/linux/tty.h index c75d886b0307..189ca80494d1 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -258,6 +258,7 @@ struct tty_struct { unsigned char warned:1; unsigned char ctrl_status; /* ctrl_lock */ unsigned int receive_room; /* Bytes free for queue */ + int flow_change; struct tty_struct *link; struct fasync_struct *fasync; @@ -318,6 +319,21 @@ struct tty_file_private { #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) +/* Values for tty->flow_change */ +#define TTY_THROTTLE_SAFE 1 +#define TTY_UNTHROTTLE_SAFE 2 + +static inline void __tty_set_flow_change(struct tty_struct *tty, int val) +{ + tty->flow_change = val; +} + +static inline void tty_set_flow_change(struct tty_struct *tty, int val) +{ + tty->flow_change = val; + smp_mb(); +} + #ifdef CONFIG_TTY extern void console_init(void); extern void tty_kref_put(struct tty_struct *tty); @@ -400,6 +416,8 @@ extern int tty_write_room(struct tty_struct *tty); extern void tty_driver_flush_buffer(struct tty_struct *tty); extern void tty_throttle(struct tty_struct *tty); extern void tty_unthrottle(struct tty_struct *tty); +extern int tty_throttle_safe(struct tty_struct *tty); +extern int tty_unthrottle_safe(struct tty_struct *tty); extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); extern void tty_driver_remove_tty(struct tty_driver *driver, struct tty_struct *tty); -- cgit From 6be06e7273c4682a15ca1f4adf1aeae510823530 Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Wed, 6 Mar 2013 08:38:21 -0500 Subject: tty: Fix checkpatch errors in tty_ldisc.h Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- include/linux/tty_ldisc.h | 132 +++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 66 deletions(-) (limited to 'include/linux') diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 455a0d7bf220..58390c73df8b 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -9,89 +9,89 @@ * * int (*open)(struct tty_struct *); * - * This function is called when the line discipline is associated - * with the tty. The line discipline can use this as an - * opportunity to initialize any state needed by the ldisc routines. - * + * This function is called when the line discipline is associated + * with the tty. The line discipline can use this as an + * opportunity to initialize any state needed by the ldisc routines. + * * void (*close)(struct tty_struct *); * * This function is called when the line discipline is being - * shutdown, either because the tty is being closed or because - * the tty is being changed to use a new line discipline - * + * shutdown, either because the tty is being closed or because + * the tty is being changed to use a new line discipline + * * void (*flush_buffer)(struct tty_struct *tty); * - * This function instructs the line discipline to clear its - * buffers of any input characters it may have queued to be - * delivered to the user mode process. 
- * + * This function instructs the line discipline to clear its + * buffers of any input characters it may have queued to be + * delivered to the user mode process. + * * ssize_t (*chars_in_buffer)(struct tty_struct *tty); * - * This function returns the number of input characters the line + * This function returns the number of input characters the line * discipline may have queued up to be delivered to the user mode * process. - * + * * ssize_t (*read)(struct tty_struct * tty, struct file * file, * unsigned char * buf, size_t nr); * - * This function is called when the user requests to read from - * the tty. The line discipline will return whatever characters - * it has buffered up for the user. If this function is not - * defined, the user will receive an EIO error. - * + * This function is called when the user requests to read from + * the tty. The line discipline will return whatever characters + * it has buffered up for the user. If this function is not + * defined, the user will receive an EIO error. + * * ssize_t (*write)(struct tty_struct * tty, struct file * file, - * const unsigned char * buf, size_t nr); - * - * This function is called when the user requests to write to the - * tty. The line discipline will deliver the characters to the - * low-level tty device for transmission, optionally performing - * some processing on the characters first. If this function is - * not defined, the user will receive an EIO error. - * + * const unsigned char * buf, size_t nr); + * + * This function is called when the user requests to write to the + * tty. The line discipline will deliver the characters to the + * low-level tty device for transmission, optionally performing + * some processing on the characters first. If this function is + * not defined, the user will receive an EIO error. + * * int (*ioctl)(struct tty_struct * tty, struct file * file, - * unsigned int cmd, unsigned long arg); + * unsigned int cmd, unsigned long arg); * * This function is called when the user requests an ioctl which - * is not handled by the tty layer or the low-level tty driver. - * It is intended for ioctls which affect line discpline - * operation. Note that the search order for ioctls is (1) tty - * layer, (2) tty low-level driver, (3) line discpline. So a - * low-level driver can "grab" an ioctl request before the line - * discpline has a chance to see it. - * + * is not handled by the tty layer or the low-level tty driver. + * It is intended for ioctls which affect line discpline + * operation. Note that the search order for ioctls is (1) tty + * layer, (2) tty low-level driver, (3) line discpline. So a + * low-level driver can "grab" an ioctl request before the line + * discpline has a chance to see it. + * * long (*compat_ioctl)(struct tty_struct * tty, struct file * file, - * unsigned int cmd, unsigned long arg); + * unsigned int cmd, unsigned long arg); * - * Process ioctl calls from 32-bit process on 64-bit system + * Process ioctl calls from 32-bit process on 64-bit system * * void (*set_termios)(struct tty_struct *tty, struct ktermios * old); * - * This function notifies the line discpline that a change has - * been made to the termios structure. - * + * This function notifies the line discpline that a change has + * been made to the termios structure. + * * int (*poll)(struct tty_struct * tty, struct file * file, - * poll_table *wait); + * poll_table *wait); * - * This function is called when a user attempts to select/poll on a - * tty device. 
It is solely the responsibility of the line - * discipline to handle poll requests. + * This function is called when a user attempts to select/poll on a + * tty device. It is solely the responsibility of the line + * discipline to handle poll requests. * * void (*receive_buf)(struct tty_struct *, const unsigned char *cp, - * char *fp, int count); - * - * This function is called by the low-level tty driver to send - * characters received by the hardware to the line discpline for - * processing. is a pointer to the buffer of input - * character received by the device. is a pointer to a - * pointer of flag bytes which indicate whether a character was - * received with a parity error, etc. - * + * char *fp, int count); + * + * This function is called by the low-level tty driver to send + * characters received by the hardware to the line discpline for + * processing. is a pointer to the buffer of input + * character received by the device. is a pointer to a + * pointer of flag bytes which indicate whether a character was + * received with a parity error, etc. + * * void (*write_wakeup)(struct tty_struct *); * - * This function is called by the low-level tty driver to signal - * that line discpline should try to send more characters to the - * low-level driver for transmission. If the line discpline does - * not have any more data to send, it can just return. + * This function is called by the low-level tty driver to signal + * that line discpline should try to send more characters to the + * low-level driver for transmission. If the line discpline does + * not have any more data to send, it can just return. * * int (*hangup)(struct tty_struct *) * @@ -115,7 +115,7 @@ struct tty_ldisc_ops { char *name; int num; int flags; - + /* * The following routines are called from above. */ @@ -123,19 +123,19 @@ struct tty_ldisc_ops { void (*close)(struct tty_struct *); void (*flush_buffer)(struct tty_struct *tty); ssize_t (*chars_in_buffer)(struct tty_struct *tty); - ssize_t (*read)(struct tty_struct * tty, struct file * file, - unsigned char __user * buf, size_t nr); - ssize_t (*write)(struct tty_struct * tty, struct file * file, - const unsigned char * buf, size_t nr); - int (*ioctl)(struct tty_struct * tty, struct file * file, + ssize_t (*read)(struct tty_struct *tty, struct file *file, + unsigned char __user *buf, size_t nr); + ssize_t (*write)(struct tty_struct *tty, struct file *file, + const unsigned char *buf, size_t nr); + int (*ioctl)(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); - long (*compat_ioctl)(struct tty_struct * tty, struct file * file, + long (*compat_ioctl)(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); - void (*set_termios)(struct tty_struct *tty, struct ktermios * old); + void (*set_termios)(struct tty_struct *tty, struct ktermios *old); unsigned int (*poll)(struct tty_struct *, struct file *, struct poll_table_struct *); int (*hangup)(struct tty_struct *tty); - + /* * The following routines are called from below. */ @@ -145,7 +145,7 @@ struct tty_ldisc_ops { void (*dcd_change)(struct tty_struct *, unsigned int); struct module *owner; - + int refcount; }; -- cgit From 6865ff222ccab371c04afce17aec1f7d70b17dbc Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 7 Mar 2013 13:12:27 +0100 Subject: TTY: do not warn about setting speed via SPD_* The warning is there since 2.1.69 and we have not seen anybody reporting it in the past decade. Remove the warning now. tty_get_baud_rate can now be inline. 
This gives us one less EXPORT_SYMBOL. Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_ioctl.c | 28 ---------------------------- include/linux/tty.h | 18 ++++++++++++++++-- 2 files changed, 16 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index 132d452578bb..28715e48b2f7 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -478,34 +478,6 @@ void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud) } EXPORT_SYMBOL_GPL(tty_encode_baud_rate); -/** - * tty_get_baud_rate - get tty bit rates - * @tty: tty to query - * - * Returns the baud rate as an integer for this terminal. The - * termios lock must be held by the caller and the terminal bit - * flags may be updated. - * - * Locking: none - */ - -speed_t tty_get_baud_rate(struct tty_struct *tty) -{ - speed_t baud = tty_termios_baud_rate(&tty->termios); - - if (baud == 38400 && tty->alt_speed) { - if (!tty->warned) { - printk(KERN_WARNING "Use of setserial/setrocket to " - "set SPD_* flags is deprecated\n"); - tty->warned = 1; - } - baud = tty->alt_speed; - } - - return baud; -} -EXPORT_SYMBOL(tty_get_baud_rate); - /** * tty_termios_copy_hw - copy hardware settings * @new: New termios diff --git a/include/linux/tty.h b/include/linux/tty.h index 189ca80494d1..63b62865c8e9 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -255,7 +255,6 @@ struct tty_struct { int count; struct winsize winsize; /* termios mutex */ unsigned char stopped:1, hw_stopped:1, flow_stopped:1, packet:1; - unsigned char warned:1; unsigned char ctrl_status; /* ctrl_lock */ unsigned int receive_room; /* Bytes free for queue */ int flow_change; @@ -437,13 +436,28 @@ extern void tty_flush_to_ldisc(struct tty_struct *tty); extern void tty_buffer_free_all(struct tty_port *port); extern void tty_buffer_flush(struct tty_struct *tty); extern void tty_buffer_init(struct tty_port *port); -extern speed_t tty_get_baud_rate(struct tty_struct *tty); extern speed_t tty_termios_baud_rate(struct ktermios *termios); extern speed_t tty_termios_input_baud_rate(struct ktermios *termios); extern void tty_termios_encode_baud_rate(struct ktermios *termios, speed_t ibaud, speed_t obaud); extern void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud); + +/** + * tty_get_baud_rate - get tty bit rates + * @tty: tty to query + * + * Returns the baud rate as an integer for this terminal. The + * termios lock must be held by the caller and the terminal bit + * flags may be updated. + * + * Locking: none + */ +static inline speed_t tty_get_baud_rate(struct tty_struct *tty) +{ + return tty_termios_baud_rate(&tty->termios); +} + extern void tty_termios_copy_hw(struct ktermios *new, struct ktermios *old); extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b); extern int tty_set_termios(struct tty_struct *tty, struct ktermios *kt); -- cgit From 6aad04f21374633bd8cecf25024553d1e11a9522 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 7 Mar 2013 13:12:29 +0100 Subject: TTY: add tty_port_tty_wakeup helper It allows us to clean up a considerable number of places that did port_get, wakeup, kref_put. Now the only thing needed is to call tty_port_tty_wakeup, which does exactly that. The one exception is ifx6x60, where tty_wakeup was open-coded; we now call tty_wakeup properly there.
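For illustration, a minimal before/after sketch of the conversion this helper enables; port here is a stand-in for whatever struct tty_port the driver owns:

	/* Before: each call site open-coded the reference dance. */
	struct tty_struct *tty = tty_port_tty_get(port);	/* takes a kref */
	if (tty) {
		tty_wakeup(tty);
		tty_kref_put(tty);
	}

	/* After: one call; the helper takes and drops the kref itself. */
	tty_port_tty_wakeup(port);

The helper deliberately mirrors the open-coded pattern, so the conversion is mechanical and behaviour-preserving at every call site (except ifx6x60, as noted above).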
Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/um/drivers/line.c | 8 +------- drivers/isdn/capi/capi.c | 7 +------ drivers/isdn/gigaset/interface.c | 6 +----- drivers/net/usb/hso.c | 13 ++----------- drivers/s390/char/sclp_tty.c | 9 ++------- drivers/s390/char/sclp_vt220.c | 8 +------- drivers/staging/fwserial/fwserial.c | 10 ++-------- drivers/staging/serqt_usb2/serqt_usb2.c | 7 +------ drivers/tty/ehv_bytechan.c | 6 +----- drivers/tty/hvc/hvsi.c | 7 +------ drivers/tty/nozomi.c | 6 +----- drivers/tty/serial/ifx6x60.c | 33 ++------------------------------- drivers/tty/tty_port.c | 16 ++++++++++++++++ drivers/usb/class/cdc-acm.c | 7 +------ drivers/usb/serial/digi_acceleport.c | 17 +++-------------- drivers/usb/serial/io_edgeport.c | 28 +++++----------------------- drivers/usb/serial/keyspan_pda.c | 6 ++---- drivers/usb/serial/mos7720.c | 8 ++------ drivers/usb/serial/mos7840.c | 7 ++----- drivers/usb/serial/ti_usb_3410_5052.c | 7 ++----- drivers/usb/serial/usb-serial.c | 10 +--------- include/linux/tty.h | 1 + 22 files changed, 51 insertions(+), 176 deletions(-) (limited to 'include/linux') diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index f1b38571f94e..cc206eda245c 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -248,7 +248,6 @@ static irqreturn_t line_write_interrupt(int irq, void *data) { struct chan *chan = data; struct line *line = chan->line; - struct tty_struct *tty; int err; /* @@ -267,12 +266,7 @@ static irqreturn_t line_write_interrupt(int irq, void *data) } spin_unlock(&line->lock); - tty = tty_port_tty_get(&line->port); - if (tty == NULL) - return IRQ_NONE; - - tty_wakeup(tty); - tty_kref_put(tty); + tty_port_tty_wakeup(&line->port); return IRQ_HANDLED; } diff --git a/drivers/isdn/capi/capi.c b/drivers/isdn/capi/capi.c index 89562a845f6a..ac6f72b455d1 100644 --- a/drivers/isdn/capi/capi.c +++ b/drivers/isdn/capi/capi.c @@ -569,7 +569,6 @@ static void capi_recv_message(struct capi20_appl *ap, struct sk_buff *skb) { struct capidev *cdev = ap->private; #ifdef CONFIG_ISDN_CAPI_MIDDLEWARE - struct tty_struct *tty; struct capiminor *mp; u16 datahandle; struct capincci *np; @@ -627,11 +626,7 @@ static void capi_recv_message(struct capi20_appl *ap, struct sk_buff *skb) CAPIMSG_U16(skb->data, CAPIMSG_BASELEN + 4 + 2)); kfree_skb(skb); capiminor_del_ack(mp, datahandle); - tty = tty_port_tty_get(&mp->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&mp->port); handle_minor_send(mp); } else { diff --git a/drivers/isdn/gigaset/interface.c b/drivers/isdn/gigaset/interface.c index e2b539675b66..600c79b030cd 100644 --- a/drivers/isdn/gigaset/interface.c +++ b/drivers/isdn/gigaset/interface.c @@ -487,12 +487,8 @@ static const struct tty_operations if_ops = { static void if_wake(unsigned long data) { struct cardstate *cs = (struct cardstate *)data; - struct tty_struct *tty = tty_port_tty_get(&cs->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&cs->port); } /*** interface to common ***/ diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index e2dd3249b6bd..a7714b4f29ad 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -1925,7 +1925,6 @@ static void hso_std_serial_write_bulk_callback(struct urb *urb) { struct hso_serial *serial = urb->context; int status = urb->status; - struct tty_struct *tty; /* sanity check */ if (!serial) { @@ -1941,11 +1940,7 @@ static void hso_std_serial_write_bulk_callback(struct urb *urb) return; } 
hso_put_activity(serial->parent); - tty = tty_port_tty_get(&serial->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&serial->port); hso_kick_transmit(serial); D1(" "); @@ -2008,12 +2003,8 @@ static void ctrl_callback(struct urb *urb) put_rxbuf_data_and_resubmit_ctrl_urb(serial); spin_unlock(&serial->serial_lock); } else { - struct tty_struct *tty = tty_port_tty_get(&serial->port); hso_put_activity(serial->parent); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&serial->port); /* response to a write command */ hso_kick_transmit(serial); } diff --git a/drivers/s390/char/sclp_tty.c b/drivers/s390/char/sclp_tty.c index 14b4cb8abcc8..7ed7a5987816 100644 --- a/drivers/s390/char/sclp_tty.c +++ b/drivers/s390/char/sclp_tty.c @@ -107,7 +107,6 @@ sclp_tty_write_room (struct tty_struct *tty) static void sclp_ttybuf_callback(struct sclp_buffer *buffer, int rc) { - struct tty_struct *tty; unsigned long flags; void *page; @@ -125,12 +124,8 @@ sclp_ttybuf_callback(struct sclp_buffer *buffer, int rc) struct sclp_buffer, list); spin_unlock_irqrestore(&sclp_tty_lock, flags); } while (buffer && sclp_emit_buffer(buffer, sclp_ttybuf_callback)); - /* check if the tty needs a wake up call */ - tty = tty_port_tty_get(&sclp_port); - if (tty != NULL) { - tty_wakeup(tty); - tty_kref_put(tty); - } + + tty_port_tty_wakeup(&sclp_port); } static inline void diff --git a/drivers/s390/char/sclp_vt220.c b/drivers/s390/char/sclp_vt220.c index 6c92f62623be..5aaaa2ec8df4 100644 --- a/drivers/s390/char/sclp_vt220.c +++ b/drivers/s390/char/sclp_vt220.c @@ -114,7 +114,6 @@ static struct sclp_register sclp_vt220_register = { static void sclp_vt220_process_queue(struct sclp_vt220_request *request) { - struct tty_struct *tty; unsigned long flags; void *page; @@ -139,12 +138,7 @@ sclp_vt220_process_queue(struct sclp_vt220_request *request) } while (__sclp_vt220_emit(request)); if (request == NULL && sclp_vt220_flush_later) sclp_vt220_emit_current(); - /* Check if the tty needs a wake up call */ - tty = tty_port_tty_get(&sclp_vt220_port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&sclp_vt220_port); } #define SCLP_BUFFER_MAX_RETRY 1 diff --git a/drivers/staging/fwserial/fwserial.c b/drivers/staging/fwserial/fwserial.c index 5a6fb44f38a8..5c64e3a35b28 100644 --- a/drivers/staging/fwserial/fwserial.c +++ b/drivers/staging/fwserial/fwserial.c @@ -744,7 +744,6 @@ static void fwtty_tx_complete(struct fw_card *card, int rcode, struct fwtty_transaction *txn) { struct fwtty_port *port = txn->port; - struct tty_struct *tty; int len; fwtty_dbg(port, "rcode: %d", rcode); @@ -769,13 +768,8 @@ static void fwtty_tx_complete(struct fw_card *card, int rcode, port->stats.dropped += txn->dma_pended.len; } - if (len < WAKEUP_CHARS) { - tty = tty_port_tty_get(&port->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } - } + if (len < WAKEUP_CHARS) + tty_port_tty_wakeup(&port->port); } static int fwtty_tx(struct fwtty_port *port, bool drain) diff --git a/drivers/staging/serqt_usb2/serqt_usb2.c b/drivers/staging/serqt_usb2/serqt_usb2.c index b1bb1a6abe81..8a6e5ea476e1 100644 --- a/drivers/staging/serqt_usb2/serqt_usb2.c +++ b/drivers/staging/serqt_usb2/serqt_usb2.c @@ -264,7 +264,6 @@ static void ProcessRxChar(struct usb_serial_port *port, unsigned char data) static void qt_write_bulk_callback(struct urb *urb) { - struct tty_struct *tty; int status; struct quatech_port *quatech_port; @@ -278,11 +277,7 @@ static void 
qt_write_bulk_callback(struct urb *urb) quatech_port = urb->context; - tty = tty_port_tty_get(&quatech_port->port->port); - - if (tty) - tty_wakeup(tty); - tty_kref_put(tty); + tty_port_tty_wakeup(&quatech_port->port->port); } static void qt_interrupt_callback(struct urb *urb) diff --git a/drivers/tty/ehv_bytechan.c b/drivers/tty/ehv_bytechan.c index ed92622b8949..6d0c27cd03da 100644 --- a/drivers/tty/ehv_bytechan.c +++ b/drivers/tty/ehv_bytechan.c @@ -472,13 +472,9 @@ static void ehv_bc_tx_dequeue(struct ehv_bc_data *bc) static irqreturn_t ehv_bc_tty_tx_isr(int irq, void *data) { struct ehv_bc_data *bc = data; - struct tty_struct *ttys = tty_port_tty_get(&bc->port); ehv_bc_tx_dequeue(bc); - if (ttys) { - tty_wakeup(ttys); - tty_kref_put(ttys); - } + tty_port_tty_wakeup(&bc->port); return IRQ_HANDLED; } diff --git a/drivers/tty/hvc/hvsi.c b/drivers/tty/hvc/hvsi.c index ef95a154854a..41901997c0d6 100644 --- a/drivers/tty/hvc/hvsi.c +++ b/drivers/tty/hvc/hvsi.c @@ -861,7 +861,6 @@ static void hvsi_write_worker(struct work_struct *work) { struct hvsi_struct *hp = container_of(work, struct hvsi_struct, writer.work); - struct tty_struct *tty; unsigned long flags; #ifdef DEBUG static long start_j = 0; @@ -895,11 +894,7 @@ static void hvsi_write_worker(struct work_struct *work) start_j = 0; #endif /* DEBUG */ wake_up_all(&hp->emptyq); - tty = tty_port_tty_get(&hp->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&hp->port); } out: diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index 2dff19796157..2e5bbdc09e1c 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -791,7 +791,6 @@ static int send_data(enum port_type index, struct nozomi *dc) const u8 toggle = port->toggle_ul; void __iomem *addr = port->ul_addr[toggle]; const u32 ul_size = port->ul_size[toggle]; - struct tty_struct *tty = tty_port_tty_get(&port->port); /* Get data from tty and place in buf for now */ size = kfifo_out(&port->fifo_ul, dc->send_buf, @@ -799,7 +798,6 @@ static int send_data(enum port_type index, struct nozomi *dc) if (size == 0) { DBG4("No more data to send, disable link:"); - tty_kref_put(tty); return 0; } @@ -809,10 +807,8 @@ static int send_data(enum port_type index, struct nozomi *dc) write_mem32(addr, (u32 *) &size, 4); write_mem32(addr + 4, (u32 *) dc->send_buf, size); - if (tty) - tty_wakeup(tty); + tty_port_tty_wakeup(&port->port); - tty_kref_put(tty); return 1; } diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c index 68d7ce997ede..d723d4193b90 100644 --- a/drivers/tty/serial/ifx6x60.c +++ b/drivers/tty/serial/ifx6x60.c @@ -442,25 +442,6 @@ static void ifx_spi_setup_spi_header(unsigned char *txbuffer, int tx_count, txbuffer[1] |= (more << IFX_SPI_MORE_BIT) & IFX_SPI_MORE_MASK; } -/** - * ifx_spi_wakeup_serial - SPI space made - * @port_data: our SPI device - * - * We have emptied the FIFO enough that we want to get more data - * queued into it. 
Poke the line discipline via tty_wakeup so that - * it will feed us more bits - */ -static void ifx_spi_wakeup_serial(struct ifx_spi_device *ifx_dev) -{ - struct tty_struct *tty; - - tty = tty_port_tty_get(&ifx_dev->tty_port); - if (!tty) - return; - tty_wakeup(tty); - tty_kref_put(tty); -} - /** * ifx_spi_prepare_tx_buffer - prepare transmit frame * @ifx_dev: our SPI device @@ -506,7 +487,7 @@ static int ifx_spi_prepare_tx_buffer(struct ifx_spi_device *ifx_dev) tx_count += temp_count; if (temp_count == queue_length) /* poke port to get more data */ - ifx_spi_wakeup_serial(ifx_dev); + tty_port_tty_wakeup(&ifx_dev->tty_port); else /* more data in port, use next SPI message */ ifx_dev->spi_more = 1; } @@ -683,8 +664,6 @@ static void ifx_spi_insert_flip_string(struct ifx_spi_device *ifx_dev, static void ifx_spi_complete(void *ctx) { struct ifx_spi_device *ifx_dev = ctx; - struct tty_struct *tty; - struct tty_ldisc *ldisc = NULL; int length; int actual_length; unsigned char more; @@ -762,15 +741,7 @@ complete_exit: */ ifx_spi_power_state_clear(ifx_dev, IFX_SPI_POWER_DATA_PENDING); - tty = tty_port_tty_get(&ifx_dev->tty_port); - if (tty) { - ldisc = tty_ldisc_ref(tty); - if (ldisc) { - ldisc->ops->write_wakeup(tty); - tty_ldisc_deref(ldisc); - } - tty_kref_put(tty); - } + tty_port_tty_wakeup(&ifx_dev->tty_port); } } } diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index b7ff59d3db88..8bb757c62ee2 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -232,6 +232,22 @@ void tty_port_hangup(struct tty_port *port) } EXPORT_SYMBOL(tty_port_hangup); +/** + * tty_port_tty_wakeup - helper to wake up a tty + * + * @port: tty port + */ +void tty_port_tty_wakeup(struct tty_port *port) +{ + struct tty_struct *tty = tty_port_tty_get(port); + + if (tty) { + tty_wakeup(tty); + tty_kref_put(tty); + } +} +EXPORT_SYMBOL_GPL(tty_port_tty_wakeup); + /** * tty_port_carrier_raised - carrier raised check * @port: tty port diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 8ac25adf31b4..755766e4b756 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -475,15 +475,10 @@ static void acm_write_bulk(struct urb *urb) static void acm_softint(struct work_struct *work) { struct acm *acm = container_of(work, struct acm, work); - struct tty_struct *tty; dev_vdbg(&acm->data->dev, "%s\n", __func__); - tty = tty_port_tty_get(&acm->port); - if (!tty) - return; - tty_wakeup(tty); - tty_kref_put(tty); + tty_port_tty_wakeup(&acm->port); } /* diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index ebe45fa0ed50..31191581060c 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -210,7 +210,6 @@ struct digi_port { /* Local Function Declarations */ -static void digi_wakeup_write(struct usb_serial_port *port); static void digi_wakeup_write_lock(struct work_struct *work); static int digi_write_oob_command(struct usb_serial_port *port, unsigned char *buf, int count, int interruptible); @@ -374,20 +373,10 @@ static void digi_wakeup_write_lock(struct work_struct *work) unsigned long flags; spin_lock_irqsave(&priv->dp_port_lock, flags); - digi_wakeup_write(port); + tty_port_tty_wakeup(&port->port); spin_unlock_irqrestore(&priv->dp_port_lock, flags); } -static void digi_wakeup_write(struct usb_serial_port *port) -{ - struct tty_struct *tty = tty_port_tty_get(&port->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } -} - - /* * Digi Write OOB Command * @@ -1044,7 +1033,7 @@ 
static void digi_write_bulk_callback(struct urb *urb) } } /* wake up processes sleeping on writes immediately */ - digi_wakeup_write(port); + tty_port_tty_wakeup(&port->port); /* also queue up a wakeup at scheduler time, in case we */ /* lost the race in write_chan(). */ schedule_work(&priv->dp_wakeup_work); @@ -1522,7 +1511,7 @@ static int digi_read_oob_callback(struct urb *urb) /* port must be open to use tty struct */ if (rts) { tty->hw_stopped = 0; - digi_wakeup_write(port); + tty_port_tty_wakeup(&port->port); } } else { priv->dp_modem_signals &= ~TIOCM_CTS; diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index b00e5cbf741f..44e5208f7c61 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -565,7 +565,6 @@ static void edge_interrupt_callback(struct urb *urb) struct device *dev; struct edgeport_port *edge_port; struct usb_serial_port *port; - struct tty_struct *tty; unsigned char *data = urb->transfer_buffer; int length = urb->actual_length; int bytes_avail; @@ -644,12 +643,7 @@ static void edge_interrupt_callback(struct urb *urb) /* tell the tty driver that something has changed */ - tty = tty_port_tty_get( - &edge_port->port->port); - if (tty) { - tty_wakeup(tty); - tty_kref_put(tty); - } + tty_port_tty_wakeup(&edge_port->port->port); /* Since we have more credit, check if more data can be sent */ send_more_port_data(edge_serial, @@ -738,7 +732,6 @@ static void edge_bulk_in_callback(struct urb *urb) static void edge_bulk_out_data_callback(struct urb *urb) { struct edgeport_port *edge_port = urb->context; - struct tty_struct *tty; int status = urb->status; if (status) { @@ -747,14 +740,8 @@ static void edge_bulk_out_data_callback(struct urb *urb) __func__, status); } - tty = tty_port_tty_get(&edge_port->port->port); - - if (tty && edge_port->open) { - /* let the tty driver wakeup if it has a special - write_wakeup function */ - tty_wakeup(tty); - } - tty_kref_put(tty); + if (edge_port->open) + tty_port_tty_wakeup(&edge_port->port->port); /* Release the Write URB */ edge_port->write_in_progress = false; @@ -773,7 +760,6 @@ static void edge_bulk_out_data_callback(struct urb *urb) static void edge_bulk_out_cmd_callback(struct urb *urb) { struct edgeport_port *edge_port = urb->context; - struct tty_struct *tty; int status = urb->status; atomic_dec(&CmdUrbs); @@ -794,13 +780,9 @@ static void edge_bulk_out_cmd_callback(struct urb *urb) return; } - /* Get pointer to tty */ - tty = tty_port_tty_get(&edge_port->port->port); - /* tell the tty driver that something has changed */ - if (tty && edge_port->open) - tty_wakeup(tty); - tty_kref_put(tty); + if (edge_port->open) + tty_port_tty_wakeup(&edge_port->port->port); /* we have completed the command */ edge_port->commandPending = false; diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c index 3b17d5d13dc8..2230223978ca 100644 --- a/drivers/usb/serial/keyspan_pda.c +++ b/drivers/usb/serial/keyspan_pda.c @@ -104,10 +104,8 @@ static void keyspan_pda_wakeup_write(struct work_struct *work) struct keyspan_pda_private *priv = container_of(work, struct keyspan_pda_private, wakeup_work); struct usb_serial_port *port = priv->port; - struct tty_struct *tty = tty_port_tty_get(&port->port); - if (tty) - tty_wakeup(tty); - tty_kref_put(tty); + + tty_port_tty_wakeup(&port->port); } static void keyspan_pda_request_unthrottle(struct work_struct *work) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index e0ebec3b5d6a..e956eae198fd 100644 --- 
a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -932,7 +932,6 @@ static void mos7720_bulk_in_callback(struct urb *urb) static void mos7720_bulk_out_data_callback(struct urb *urb) { struct moschip_port *mos7720_port; - struct tty_struct *tty; int status = urb->status; if (status) { @@ -946,11 +945,8 @@ static void mos7720_bulk_out_data_callback(struct urb *urb) return ; } - tty = tty_port_tty_get(&mos7720_port->port->port); - - if (tty && mos7720_port->open) - tty_wakeup(tty); - tty_kref_put(tty); + if (mos7720_port->open) + tty_port_tty_wakeup(&mos7720_port->port->port); } /* diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 809fb329eca5..08284d28e84b 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -814,7 +814,6 @@ static void mos7840_bulk_out_data_callback(struct urb *urb) { struct moschip_port *mos7840_port; struct usb_serial_port *port; - struct tty_struct *tty; int status = urb->status; int i; @@ -837,10 +836,8 @@ static void mos7840_bulk_out_data_callback(struct urb *urb) if (mos7840_port_paranoia_check(port, __func__)) return; - tty = tty_port_tty_get(&port->port); - if (tty && mos7840_port->open) - tty_wakeup(tty); - tty_kref_put(tty); + if (mos7840_port->open) + tty_port_tty_wakeup(&port->port); } diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index 39cb9b807c3c..437f2d579cde 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -1227,7 +1227,6 @@ static void ti_send(struct ti_port *tport) { int count, result; struct usb_serial_port *port = tport->tp_port; - struct tty_struct *tty = tty_port_tty_get(&port->port); /* FIXME */ unsigned long flags; spin_lock_irqsave(&tport->tp_lock, flags); @@ -1268,14 +1267,12 @@ static void ti_send(struct ti_port *tport) } /* more room in the buffer for new writes, wakeup */ - if (tty) - tty_wakeup(tty); - tty_kref_put(tty); + tty_port_tty_wakeup(&port->port); + wake_up_interruptible(&tport->tp_write_wait); return; unlock: spin_unlock_irqrestore(&tport->tp_lock, flags); - tty_kref_put(tty); return; } diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index a19ed74d770d..2df84845bafb 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -541,16 +541,8 @@ static void usb_serial_port_work(struct work_struct *work) { struct usb_serial_port *port = container_of(work, struct usb_serial_port, work); - struct tty_struct *tty; - tty = tty_port_tty_get(&port->port); - if (!tty) - return; - - dev_dbg(tty->dev, "%s - port %d\n", __func__, port->number); - - tty_wakeup(tty); - tty_kref_put(tty); + tty_port_tty_wakeup(&port->port); } static void kill_traffic(struct usb_serial_port *port) diff --git a/include/linux/tty.h b/include/linux/tty.h index 63b62865c8e9..b6e890a87eb1 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -534,6 +534,7 @@ extern int tty_port_carrier_raised(struct tty_port *port); extern void tty_port_raise_dtr_rts(struct tty_port *port); extern void tty_port_lower_dtr_rts(struct tty_port *port); extern void tty_port_hangup(struct tty_port *port); +extern void tty_port_tty_wakeup(struct tty_port *port); extern int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); extern int tty_port_close_start(struct tty_port *port, -- cgit From aa27a094e2c2e0cc59914e56113b860f524f4479 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 7 Mar 2013 13:12:30 +0100 Subject: TTY: add 
tty_port_tty_hangup helper It allows us to clean up a considerable number of places that did port_get, hangup, kref_put. Now the only thing needed is to call tty_port_tty_hangup, which does exactly that. Callers can also decide whether to honour CLOCAL or to ignore it completely. Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/um/drivers/chan_kern.c | 6 +----- drivers/mmc/card/sdio_uart.c | 13 ++---------- drivers/net/usb/hso.c | 7 +----- drivers/tty/cyclades.c | 10 ++------- drivers/tty/moxa.c | 19 ++++++---------- drivers/tty/n_gsm.c | 6 +----- drivers/tty/nozomi.c | 9 +++----- drivers/tty/rocket.c | 7 +----- drivers/tty/serial/ifx6x60.c | 21 ++---------------- drivers/tty/tty_port.c | 17 +++++++++++++++ drivers/usb/class/cdc-acm.c | 24 ++++++--------------- drivers/usb/serial/keyspan.c | 43 +++++++++---------------------------- drivers/usb/serial/option.c | 9 ++------ drivers/usb/serial/sierra.c | 8 ++----- include/linux/tty.h | 1 + net/irda/ircomm/ircomm_tty_attach.c | 6 +----- 16 files changed, 58 insertions(+), 148 deletions(-) (limited to 'include/linux') diff --git a/arch/um/drivers/chan_kern.c b/arch/um/drivers/chan_kern.c index 15c553c239a1..bf42825ba54f 100644 --- a/arch/um/drivers/chan_kern.c +++ b/arch/um/drivers/chan_kern.c @@ -568,11 +568,7 @@ void chan_interrupt(struct line *line, int irq) reactivate_fd(chan->fd, irq); if (err == -EIO) { if (chan->primary) { - struct tty_struct *tty = tty_port_tty_get(&line->port); - if (tty != NULL) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&line->port, false); if (line->chan_out != chan) close_one_chan(line->chan_out, 1); } diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c index c931dfe6a59c..f093cea0d060 100644 --- a/drivers/mmc/card/sdio_uart.c +++ b/drivers/mmc/card/sdio_uart.c @@ -134,7 +134,6 @@ static void sdio_uart_port_put(struct sdio_uart_port *port) static void sdio_uart_port_remove(struct sdio_uart_port *port) { struct sdio_func *func; - struct tty_struct *tty; BUG_ON(sdio_uart_table[port->index] != port); @@ -155,12 +154,8 @@ static void sdio_uart_port_remove(struct sdio_uart_port *port) sdio_claim_host(func); port->func = NULL; mutex_unlock(&port->func_lock); - tty = tty_port_tty_get(&port->port); /* tty_hangup is async so is this safe as is ??
*/ - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&port->port, false); mutex_unlock(&port->port.mutex); sdio_release_irq(func); sdio_disable_func(func); @@ -492,11 +487,7 @@ static void sdio_uart_check_modem_status(struct sdio_uart_port *port) wake_up_interruptible(&port->port.open_wait); else { /* DCD drop - hang up if tty attached */ - tty = tty_port_tty_get(&port->port); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&port->port, false); } } if (status & UART_MSR_DCTS) { diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index a7714b4f29ad..cba1d46e672e 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -3124,18 +3124,13 @@ static void hso_serial_ref_free(struct kref *ref) static void hso_free_interface(struct usb_interface *interface) { struct hso_serial *hso_dev; - struct tty_struct *tty; int i; for (i = 0; i < HSO_SERIAL_TTY_MINORS; i++) { if (serial_table[i] && (serial_table[i]->interface == interface)) { hso_dev = dev2ser(serial_table[i]); - tty = tty_port_tty_get(&hso_dev->port); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&hso_dev->port, false); mutex_lock(&hso_dev->parent->mutex); hso_dev->parent->usb_gone = 1; mutex_unlock(&hso_dev->parent->mutex); diff --git a/drivers/tty/cyclades.c b/drivers/tty/cyclades.c index 345bd0e0884e..33f83fee9fae 100644 --- a/drivers/tty/cyclades.c +++ b/drivers/tty/cyclades.c @@ -1124,14 +1124,8 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo) readl(&info->u.cyz.ch_ctrl->rs_status); if (dcd & C_RS_DCD) wake_up_interruptible(&info->port.open_wait); - else { - struct tty_struct *tty; - tty = tty_port_tty_get(&info->port); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } - } + else + tty_port_tty_hangup(&info->port, false); } break; case C_CM_MCTS: diff --git a/drivers/tty/moxa.c b/drivers/tty/moxa.c index adeac255e526..1deaca4674e4 100644 --- a/drivers/tty/moxa.c +++ b/drivers/tty/moxa.c @@ -913,16 +913,12 @@ static void moxa_board_deinit(struct moxa_board_conf *brd) /* pci hot-un-plug support */ for (a = 0; a < brd->numPorts; a++) - if (brd->ports[a].port.flags & ASYNC_INITIALIZED) { - struct tty_struct *tty = tty_port_tty_get( - &brd->ports[a].port); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } - } + if (brd->ports[a].port.flags & ASYNC_INITIALIZED) + tty_port_tty_hangup(&brd->ports[a].port, false); + for (a = 0; a < MAX_PORTS_PER_BOARD; a++) tty_port_destroy(&brd->ports[a].port); + while (1) { opened = 0; for (a = 0; a < brd->numPorts; a++) @@ -1365,7 +1361,6 @@ static void moxa_hangup(struct tty_struct *tty) static void moxa_new_dcdstate(struct moxa_port *p, u8 dcd) { - struct tty_struct *tty; unsigned long flags; dcd = !!dcd; @@ -1373,10 +1368,8 @@ static void moxa_new_dcdstate(struct moxa_port *p, u8 dcd) if (dcd != p->DCDState) { p->DCDState = dcd; spin_unlock_irqrestore(&p->port.lock, flags); - tty = tty_port_tty_get(&p->port); - if (tty && !C_CLOCAL(tty) && !dcd) - tty_hangup(tty); - tty_kref_put(tty); + if (!dcd) + tty_port_tty_hangup(&p->port, true); } else spin_unlock_irqrestore(&p->port.lock, flags); diff --git a/drivers/tty/n_gsm.c b/drivers/tty/n_gsm.c index 4a43ef5d7962..74d9a0258d7c 100644 --- a/drivers/tty/n_gsm.c +++ b/drivers/tty/n_gsm.c @@ -1418,11 +1418,7 @@ static void gsm_dlci_close(struct gsm_dlci *dlci) pr_debug("DLCI %d goes closed.\n", dlci->addr); dlci->state = DLCI_CLOSED; if (dlci->addr != 0) { - struct tty_struct *tty = tty_port_tty_get(&dlci->port); - if (tty) { - 
tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&dlci->port, false); kfifo_reset(dlci->fifo); } else dlci->gsm->dead = 1; diff --git a/drivers/tty/nozomi.c b/drivers/tty/nozomi.c index 2e5bbdc09e1c..d6080c3831ef 100644 --- a/drivers/tty/nozomi.c +++ b/drivers/tty/nozomi.c @@ -1501,12 +1501,9 @@ static void tty_exit(struct nozomi *dc) DBG1(" "); - for (i = 0; i < MAX_PORT; ++i) { - struct tty_struct *tty = tty_port_tty_get(&dc->port[i].port); - if (tty && list_empty(&tty->hangup_work.entry)) - tty_hangup(tty); - tty_kref_put(tty); - } + for (i = 0; i < MAX_PORT; ++i) + tty_port_tty_hangup(&dc->port[i].port, false); + /* Racy below - surely should wait for scheduled work to be done or complete off a hangup method ? */ while (dc->open_ttys) diff --git a/drivers/tty/rocket.c b/drivers/tty/rocket.c index 1d270034bfc3..bbffd7a431e9 100644 --- a/drivers/tty/rocket.c +++ b/drivers/tty/rocket.c @@ -521,15 +521,10 @@ static void rp_handle_port(struct r_port *info) (ChanStatus & CD_ACT) ? "on" : "off"); #endif if (!(ChanStatus & CD_ACT) && info->cd_status) { - struct tty_struct *tty; #ifdef ROCKET_DEBUG_HANGUP printk(KERN_INFO "CD drop, calling hangup.\n"); #endif - tty = tty_port_tty_get(&info->port); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&info->port, false); } info->cd_status = (ChanStatus & CD_ACT) ? 1 : 0; wake_up_interruptible(&info->port.open_wait); diff --git a/drivers/tty/serial/ifx6x60.c b/drivers/tty/serial/ifx6x60.c index d723d4193b90..2c77fed31a72 100644 --- a/drivers/tty/serial/ifx6x60.c +++ b/drivers/tty/serial/ifx6x60.c @@ -269,23 +269,6 @@ static void mrdy_assert(struct ifx_spi_device *ifx_dev) mrdy_set_high(ifx_dev); } -/** - * ifx_spi_hangup - hang up an IFX device - * @ifx_dev: our SPI device - * - * Hang up the tty attached to the IFX device if one is currently - * open. If not take no action - */ -static void ifx_spi_ttyhangup(struct ifx_spi_device *ifx_dev) -{ - struct tty_port *pport = &ifx_dev->tty_port; - struct tty_struct *tty = tty_port_tty_get(pport); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } -} - /** * ifx_spi_timeout - SPI timeout * @arg: our SPI device @@ -298,7 +281,7 @@ static void ifx_spi_timeout(unsigned long arg) struct ifx_spi_device *ifx_dev = (struct ifx_spi_device *)arg; dev_warn(&ifx_dev->spi_dev->dev, "*** SPI Timeout ***"); - ifx_spi_ttyhangup(ifx_dev); + tty_port_tty_hangup(&ifx_dev->tty_port, false); mrdy_set_low(ifx_dev); clear_bit(IFX_SPI_STATE_TIMER_PENDING, &ifx_dev->flags); } @@ -933,7 +916,7 @@ static irqreturn_t ifx_spi_reset_interrupt(int irq, void *dev) set_bit(MR_INPROGRESS, &ifx_dev->mdm_reset_state); if (!solreset) { /* unsolicited reset */ - ifx_spi_ttyhangup(ifx_dev); + tty_port_tty_hangup(&ifx_dev->tty_port, false); } } else { /* exited reset */ diff --git a/drivers/tty/tty_port.c b/drivers/tty/tty_port.c index 8bb757c62ee2..7f38eeaafac3 100644 --- a/drivers/tty/tty_port.c +++ b/drivers/tty/tty_port.c @@ -232,6 +232,23 @@ void tty_port_hangup(struct tty_port *port) } EXPORT_SYMBOL(tty_port_hangup); +/** + * tty_port_tty_hangup - helper to hang up a tty + * + * @port: tty port + * @check_clocal: hang only ttys with CLOCAL unset? 
+ */ +void tty_port_tty_hangup(struct tty_port *port, bool check_clocal) +{ + struct tty_struct *tty = tty_port_tty_get(port); + + if (tty && (!check_clocal || !C_CLOCAL(tty))) { + tty_hangup(tty); + tty_kref_put(tty); + } +} +EXPORT_SYMBOL_GPL(tty_port_tty_hangup); + /** * tty_port_tty_wakeup - helper to wake up a tty * diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index 755766e4b756..27a18743275e 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -292,7 +292,6 @@ static void acm_ctrl_irq(struct urb *urb) { struct acm *acm = urb->context; struct usb_cdc_notification *dr = urb->transfer_buffer; - struct tty_struct *tty; unsigned char *data; int newctrl; int retval; @@ -327,17 +326,12 @@ static void acm_ctrl_irq(struct urb *urb) break; case USB_CDC_NOTIFY_SERIAL_STATE: - tty = tty_port_tty_get(&acm->port); newctrl = get_unaligned_le16(data); - if (tty) { - if (!acm->clocal && - (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) { - dev_dbg(&acm->control->dev, - "%s - calling hangup\n", __func__); - tty_hangup(tty); - } - tty_kref_put(tty); + if (!acm->clocal && (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) { + dev_dbg(&acm->control->dev, "%s - calling hangup\n", + __func__); + tty_port_tty_hangup(&acm->port, false); } acm->ctrlin = newctrl; @@ -1498,15 +1492,9 @@ err_out: static int acm_reset_resume(struct usb_interface *intf) { struct acm *acm = usb_get_intfdata(intf); - struct tty_struct *tty; - if (test_bit(ASYNCB_INITIALIZED, &acm->port.flags)) { - tty = tty_port_tty_get(&acm->port); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } - } + if (test_bit(ASYNCB_INITIALIZED, &acm->port.flags)) + tty_port_tty_hangup(&acm->port, false); return acm_resume(intf); } diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c index 1fd1935c8316..b011478d2e5f 100644 --- a/drivers/usb/serial/keyspan.c +++ b/drivers/usb/serial/keyspan.c @@ -378,7 +378,6 @@ static void usa26_instat_callback(struct urb *urb) struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; - struct tty_struct *tty; int old_dcd_state, err; int status = urb->status; @@ -421,12 +420,8 @@ static void usa26_instat_callback(struct urb *urb) p_priv->dcd_state = ((msg->gpia_dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); - if (old_dcd_state != p_priv->dcd_state) { - tty = tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty)) - tty_hangup(tty); - tty_kref_put(tty); - } + if (old_dcd_state != p_priv->dcd_state) + tty_port_tty_hangup(&port->port, true); /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); @@ -510,7 +505,6 @@ static void usa28_instat_callback(struct urb *urb) struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; - struct tty_struct *tty; int old_dcd_state; int status = urb->status; @@ -551,12 +545,8 @@ static void usa28_instat_callback(struct urb *urb) p_priv->dcd_state = ((msg->dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); - if (old_dcd_state != p_priv->dcd_state && old_dcd_state) { - tty = tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty)) - tty_hangup(tty); - tty_kref_put(tty); - } + if (old_dcd_state != p_priv->dcd_state && old_dcd_state) + tty_port_tty_hangup(&port->port, true); /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); @@ -642,12 +632,8 @@ static void usa49_instat_callback(struct urb *urb) p_priv->dcd_state = ((msg->dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 
1 : 0); - if (old_dcd_state != p_priv->dcd_state && old_dcd_state) { - struct tty_struct *tty = tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty)) - tty_hangup(tty); - tty_kref_put(tty); - } + if (old_dcd_state != p_priv->dcd_state && old_dcd_state) + tty_port_tty_hangup(&port->port, true); /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); @@ -851,7 +837,6 @@ static void usa90_instat_callback(struct urb *urb) struct usb_serial *serial; struct usb_serial_port *port; struct keyspan_port_private *p_priv; - struct tty_struct *tty; int old_dcd_state, err; int status = urb->status; @@ -880,12 +865,8 @@ static void usa90_instat_callback(struct urb *urb) p_priv->dcd_state = ((msg->dcd) ? 1 : 0); p_priv->ri_state = ((msg->ri) ? 1 : 0); - if (old_dcd_state != p_priv->dcd_state && old_dcd_state) { - tty = tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty)) - tty_hangup(tty); - tty_kref_put(tty); - } + if (old_dcd_state != p_priv->dcd_state && old_dcd_state) + tty_port_tty_hangup(&port->port, true); /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); @@ -953,12 +934,8 @@ static void usa67_instat_callback(struct urb *urb) p_priv->cts_state = ((msg->hskia_cts) ? 1 : 0); p_priv->dcd_state = ((msg->gpia_dcd) ? 1 : 0); - if (old_dcd_state != p_priv->dcd_state && old_dcd_state) { - struct tty_struct *tty = tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty)) - tty_hangup(tty); - tty_kref_put(tty); - } + if (old_dcd_state != p_priv->dcd_state && old_dcd_state) + tty_port_tty_hangup(&port->port, true); /* Resubmit urb so we continue receiving */ err = usb_submit_urb(urb, GFP_ATOMIC); diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index f7d339d8187b..602d1f389a3b 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -1532,13 +1532,8 @@ static void option_instat_callback(struct urb *urb) portdata->dsr_state = ((signals & 0x02) ? 1 : 0); portdata->ri_state = ((signals & 0x08) ? 1 : 0); - if (old_dcd_state && !portdata->dcd_state) { - struct tty_struct *tty = - tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty)) - tty_hangup(tty); - tty_kref_put(tty); - } + if (old_dcd_state && !portdata->dcd_state) + tty_port_tty_hangup(&port->port, true); } else { dev_dbg(dev, "%s: type %x req %x\n", __func__, req_pkt->bRequestType, req_pkt->bRequest); diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index c13f6e747748..d66148a17fe3 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -628,7 +628,6 @@ static void sierra_instat_callback(struct urb *urb) unsigned char signals = *((unsigned char *) urb->transfer_buffer + sizeof(struct usb_ctrlrequest)); - struct tty_struct *tty; dev_dbg(&port->dev, "%s: signal x%x\n", __func__, signals); @@ -639,11 +638,8 @@ static void sierra_instat_callback(struct urb *urb) portdata->dsr_state = ((signals & 0x02) ? 1 : 0); portdata->ri_state = ((signals & 0x08) ? 
1 : 0); - tty = tty_port_tty_get(&port->port); - if (tty && !C_CLOCAL(tty) && - old_dcd_state && !portdata->dcd_state) - tty_hangup(tty); - tty_kref_put(tty); + if (old_dcd_state && !portdata->dcd_state) + tty_port_tty_hangup(&port->port, true); } else { dev_dbg(&port->dev, "%s: type %x req %x\n", __func__, req_pkt->bRequestType, diff --git a/include/linux/tty.h b/include/linux/tty.h index b6e890a87eb1..d3548f871968 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -534,6 +534,7 @@ extern int tty_port_carrier_raised(struct tty_port *port); extern void tty_port_raise_dtr_rts(struct tty_port *port); extern void tty_port_lower_dtr_rts(struct tty_port *port); extern void tty_port_hangup(struct tty_port *port); +extern void tty_port_tty_hangup(struct tty_port *port, bool check_clocal); extern void tty_port_tty_wakeup(struct tty_port *port); extern int tty_port_block_til_ready(struct tty_port *port, struct tty_struct *tty, struct file *filp); diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c index edab393e0c82..a2a508f5f268 100644 --- a/net/irda/ircomm/ircomm_tty_attach.c +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -997,12 +997,8 @@ static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, self->settings.dce = IRCOMM_DELTA_CD; ircomm_tty_check_modem_status(self); } else { - struct tty_struct *tty = tty_port_tty_get(&self->port); IRDA_DEBUG(0, "%s(), hanging up!\n", __func__ ); - if (tty) { - tty_hangup(tty); - tty_kref_put(tty); - } + tty_port_tty_hangup(&self->port, false); } break; default: -- cgit From 21622939fc452c7fb739464b8e49368c3ceaa0ee Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Mon, 11 Mar 2013 16:44:21 -0400 Subject: tty: Add diagnostic for halted line discipline Flip buffer work must not be scheduled by the line discipline after the line discipline has been halted; issue a warning. Note: drivers can still schedule flip buffer work.
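The soundness of the diagnostic comes from ordering, visible in the hunks that follow: tty_ldisc_halt() cancels the buffer work and only afterwards sets TTY_LDISC_HALTED, so any subsequent attempt by the ldisc to reschedule that work is a genuine bug. A condensed sketch of the check added to n_tty_set_room():

	if (left && !old_left) {
		/* the ldisc has been halted and ->buf.work cancelled,
		 * yet ->buf.work is about to be rescheduled - warn,
		 * ratelimited */
		WARN_RATELIMIT(test_bit(TTY_LDISC_HALTED, &tty->flags),
			       "scheduling buffer work for halted ldisc\n");
		schedule_work(&tty->port->buf.work);
	}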
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/n_tty.c | 8 ++++++++ drivers/tty/tty_ldisc.c | 7 ++++++- include/linux/tty.h | 1 + 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c index 68865d9af8a0..16793eccc6ae 100644 --- a/drivers/tty/n_tty.c +++ b/drivers/tty/n_tty.c @@ -153,6 +153,12 @@ static void n_tty_set_room(struct tty_struct *tty) if (left && !old_left) { WARN_RATELIMIT(tty->port->itty == NULL, "scheduling with invalid itty\n"); + /* see if ldisc has been killed - if so, this means that + * even though the ldisc has been halted and ->buf.work + * cancelled, ->buf.work is about to be rescheduled + */ + WARN_RATELIMIT(test_bit(TTY_LDISC_HALTED, &tty->flags), + "scheduling buffer work for halted ldisc\n"); schedule_work(&tty->port->buf.work); } } @@ -1624,6 +1630,8 @@ static int n_tty_open(struct tty_struct *tty) goto err_free_bufs; tty->disc_data = ldata; + /* indicate buffer work may resume */ + clear_bit(TTY_LDISC_HALTED, &tty->flags); reset_buffer_flags(tty); tty_unthrottle(tty); ldata->column = 0; diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index d794087c327e..c641321b9404 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -375,6 +375,7 @@ static inline void tty_ldisc_put(struct tty_ldisc *ld) void tty_ldisc_enable(struct tty_struct *tty) { + clear_bit(TTY_LDISC_HALTED, &tty->flags); set_bit(TTY_LDISC, &tty->flags); clear_bit(TTY_LDISC_CHANGING, &tty->flags); wake_up(&tty_ldisc_wait); @@ -513,8 +514,11 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old) static int tty_ldisc_halt(struct tty_struct *tty) { + int scheduled; clear_bit(TTY_LDISC, &tty->flags); - return cancel_work_sync(&tty->port->buf.work); + scheduled = cancel_work_sync(&tty->port->buf.work); + set_bit(TTY_LDISC_HALTED, &tty->flags); + return scheduled; } /** @@ -820,6 +824,7 @@ void tty_ldisc_hangup(struct tty_struct *tty) clear_bit(TTY_LDISC, &tty->flags); tty_unlock(tty); cancel_work_sync(&tty->port->buf.work); + set_bit(TTY_LDISC_HALTED, &tty->flags); mutex_unlock(&tty->ldisc_mutex); retry: tty_lock(tty); diff --git a/include/linux/tty.h b/include/linux/tty.h index d3548f871968..66ae020e8a98 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -315,6 +315,7 @@ struct tty_file_private { #define TTY_NO_WRITE_SPLIT 17 /* Preserve write boundaries to driver */ #define TTY_HUPPED 18 /* Post driver->hangup() */ #define TTY_HUPPING 21 /* ->hangup() in progress */ +#define TTY_LDISC_HALTED 22 /* Line discipline is halted */ #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty)) -- cgit From d912156605b0eb3b3070dc7eabc43db6379aa43b Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Mon, 11 Mar 2013 16:44:33 -0400 Subject: tty: Don't reenable already enabled ldisc tty_ldisc_hangup() guarantees the ldisc is enabled (or that there is no ldisc). Since __tty_hangup() was the only user, re-define tty_ldisc_enable() with file scope.
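Taken together with the diagnostic patch above, the flag's lifecycle is now symmetric: halt sets TTY_LDISC_HALTED only after cancelling the buffer work, and enable clears it before setting TTY_LDISC. A condensed view assembled from the hunks in this series:

	static int tty_ldisc_halt(struct tty_struct *tty)
	{
		int scheduled;

		clear_bit(TTY_LDISC, &tty->flags);
		scheduled = cancel_work_sync(&tty->port->buf.work);
		set_bit(TTY_LDISC_HALTED, &tty->flags);	/* buffer work must stop */
		return scheduled;
	}

	static void tty_ldisc_enable(struct tty_struct *tty)
	{
		clear_bit(TTY_LDISC_HALTED, &tty->flags); /* buffer work may resume */
		set_bit(TTY_LDISC, &tty->flags);
		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
		wake_up(&tty_ldisc_wait);
	}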
Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_io.c | 1 - drivers/tty/tty_ldisc.c | 2 +- include/linux/tty.h | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index d3ddb31e363e..e6ee0f459a20 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -693,7 +693,6 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session) */ set_bit(TTY_HUPPED, &tty->flags); clear_bit(TTY_HUPPING, &tty->flags); - tty_ldisc_enable(tty); tty_unlock(tty); diff --git a/drivers/tty/tty_ldisc.c b/drivers/tty/tty_ldisc.c index 37671fcc7e4c..9c727da59fac 100644 --- a/drivers/tty/tty_ldisc.c +++ b/drivers/tty/tty_ldisc.c @@ -373,7 +373,7 @@ static inline void tty_ldisc_put(struct tty_ldisc *ld) * Clearing directly is allowed. */ -void tty_ldisc_enable(struct tty_struct *tty) +static void tty_ldisc_enable(struct tty_struct *tty) { clear_bit(TTY_LDISC_HALTED, &tty->flags); set_bit(TTY_LDISC, &tty->flags); diff --git a/include/linux/tty.h b/include/linux/tty.h index 66ae020e8a98..367a9dfc4ea2 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -561,8 +561,6 @@ extern void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty); extern void tty_ldisc_init(struct tty_struct *tty); extern void tty_ldisc_deinit(struct tty_struct *tty); extern void tty_ldisc_begin(void); -/* This last one is just for the tty layer internals and shouldn't be used elsewhere */ -extern void tty_ldisc_enable(struct tty_struct *tty); /* n_tty.c */ -- cgit From 6f8da5df8c451103e0043f73a00c90676da6be9e Mon Sep 17 00:00:00 2001 From: Rhyland Klein Date: Tue, 12 Mar 2013 18:08:09 -0400 Subject: power_supply: Add support for tps65090-charger This patch adds support for the tps65090 charger driver. This driver is responsible for controlling the charger aspect of the tps65090 mfd. Currently, this mainly consists of turning on and off the charger, but some features of the charger can be supported through this driver including: - Enable Auto Recharge based on Battery voltage - Fast Charge Safety Timer - Maximum battery discharge current - Maximum battery adapter current - Enable External Charge - Disable charging termination based on low charger current (supported) Once the driver is accepted, later patches can add support for the features above which are not yet supported. Based on work by: Syed Rafiuddin Laxman Dewangan Signed-off-by: Rhyland Klein Signed-off-by: Anton Vorontsov --- drivers/power/Kconfig | 7 + drivers/power/Makefile | 1 + drivers/power/tps65090-charger.c | 315 +++++++++++++++++++++++++++++++++++++++ include/linux/mfd/tps65090.h | 5 + 4 files changed, 328 insertions(+) create mode 100644 drivers/power/tps65090-charger.c (limited to 'include/linux') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 07e1a8f8d03e..339f802b91c1 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -340,6 +340,13 @@ config CHARGER_SMB347 Say Y to include support for Summit Microelectronics SMB347 Battery Charger. +config CHARGER_TPS65090 + tristate "TPS65090 battery charger driver" + depends on MFD_TPS65090 + help + Say Y here to enable support for battery charging with TPS65090 + PMIC chips. 
+ config AB8500_BM bool "AB8500 Battery Management Driver" depends on AB8500_CORE && AB8500_GPADC diff --git a/drivers/power/Makefile b/drivers/power/Makefile index eb520ea74970..653bf6ceff30 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -52,4 +52,5 @@ obj-$(CONFIG_CHARGER_MAX8998) += max8998_charger.o obj-$(CONFIG_CHARGER_BQ2415X) += bq2415x_charger.o obj-$(CONFIG_POWER_AVS) += avs/ obj-$(CONFIG_CHARGER_SMB347) += smb347-charger.o +obj-$(CONFIG_CHARGER_TPS65090) += tps65090-charger.o obj-$(CONFIG_POWER_RESET) += reset/ diff --git a/drivers/power/tps65090-charger.c b/drivers/power/tps65090-charger.c new file mode 100644 index 000000000000..0c66c6656b13 --- /dev/null +++ b/drivers/power/tps65090-charger.c @@ -0,0 +1,315 @@ +/* + * Battery charger driver for TI's tps65090 + * + * Copyright (c) 2013, NVIDIA CORPORATION. All rights reserved. + + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TPS65090_REG_INTR_STS 0x00 +#define TPS65090_REG_CG_CTRL0 0x04 +#define TPS65090_REG_CG_CTRL1 0x05 +#define TPS65090_REG_CG_CTRL2 0x06 +#define TPS65090_REG_CG_CTRL3 0x07 +#define TPS65090_REG_CG_CTRL4 0x08 +#define TPS65090_REG_CG_CTRL5 0x09 +#define TPS65090_REG_CG_STATUS1 0x0a +#define TPS65090_REG_CG_STATUS2 0x0b + +#define TPS65090_CHARGER_ENABLE BIT(0) +#define TPS65090_VACG BIT(1) +#define TPS65090_NOITERM BIT(5) + +struct tps65090_charger { + struct device *dev; + int ac_online; + int prev_ac_online; + int irq; + struct power_supply ac; + struct tps65090_platform_data *pdata; +}; + +static enum power_supply_property tps65090_ac_props[] = { + POWER_SUPPLY_PROP_ONLINE, +}; + +static int tps65090_low_chrg_current(struct tps65090_charger *charger) +{ + int ret; + + ret = tps65090_write(charger->dev->parent, TPS65090_REG_CG_CTRL5, + TPS65090_NOITERM); + if (ret < 0) { + dev_err(charger->dev, "%s(): error reading in register 0x%x\n", + __func__, TPS65090_REG_CG_CTRL5); + return ret; + } + return 0; +} + +static int tps65090_enable_charging(struct tps65090_charger *charger, + uint8_t enable) +{ + int ret; + uint8_t ctrl0 = 0; + + ret = tps65090_read(charger->dev->parent, TPS65090_REG_CG_CTRL0, + &ctrl0); + if (ret < 0) { + dev_err(charger->dev, "%s(): error reading in register 0x%x\n", + __func__, TPS65090_REG_CG_CTRL0); + return ret; + } + + ret = tps65090_write(charger->dev->parent, TPS65090_REG_CG_CTRL0, + (ctrl0 | TPS65090_CHARGER_ENABLE)); + if (ret < 0) { + dev_err(charger->dev, "%s(): error reading in register 0x%x\n", + __func__, TPS65090_REG_CG_CTRL0); + return ret; + } + return 0; +} + +static int tps65090_config_charger(struct tps65090_charger *charger) +{ + int ret; + + if (charger->pdata->enable_low_current_chrg) { + ret = tps65090_low_chrg_current(charger); + if (ret < 0) { + dev_err(charger->dev, + "error configuring low charge current\n"); + return ret; + } + } + + return 0; +} + +static int tps65090_ac_get_property(struct power_supply *psy, + 
enum power_supply_property psp, + union power_supply_propval *val) +{ + struct tps65090_charger *charger = container_of(psy, + struct tps65090_charger, ac); + + if (psp == POWER_SUPPLY_PROP_ONLINE) { + val->intval = charger->ac_online; + charger->prev_ac_online = charger->ac_online; + return 0; + } + return -EINVAL; +} + +static irqreturn_t tps65090_charger_isr(int irq, void *dev_id) +{ + struct tps65090_charger *charger = dev_id; + int ret; + uint8_t status1 = 0; + uint8_t intrsts = 0; + + ret = tps65090_read(charger->dev->parent, TPS65090_REG_CG_STATUS1, + &status1); + if (ret < 0) { + dev_err(charger->dev, "%s(): Error in reading reg 0x%x\n", + __func__, TPS65090_REG_CG_STATUS1); + return IRQ_HANDLED; + } + msleep(75); + ret = tps65090_read(charger->dev->parent, TPS65090_REG_INTR_STS, + &intrsts); + if (ret < 0) { + dev_err(charger->dev, "%s(): Error in reading reg 0x%x\n", + __func__, TPS65090_REG_INTR_STS); + return IRQ_HANDLED; + } + + if (intrsts & TPS65090_VACG) { + ret = tps65090_enable_charging(charger, 1); + if (ret < 0) + return IRQ_HANDLED; + charger->ac_online = 1; + } else { + charger->ac_online = 0; + } + + if (charger->prev_ac_online != charger->ac_online) + power_supply_changed(&charger->ac); + + return IRQ_HANDLED; +} + +#if defined(CONFIG_OF) + +#include + +static struct tps65090_platform_data * + tps65090_parse_dt_charger_data(struct platform_device *pdev) +{ + struct tps65090_platform_data *pdata; + struct device_node *np = pdev->dev.parent->of_node; + unsigned int prop; + + pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); + if (!pdata) { + dev_err(&pdev->dev, "Memory alloc for tps65090_pdata failed\n"); + return NULL; + } + + prop = of_property_read_bool(np, "ti,enable-low-current-chrg"); + pdata->enable_low_current_chrg = prop; + + pdata->irq_base = -1; + + return pdata; + +} +#else +static struct tps65090_platform_data * + tps65090_parse_dt_charger_data(struct platform_device *pdev) +{ + return NULL; +} +#endif + +static int tps65090_charger_probe(struct platform_device *pdev) +{ + struct tps65090 *tps65090_mfd = dev_get_drvdata(pdev->dev.parent); + struct tps65090_charger *cdata; + struct tps65090_platform_data *pdata; + uint8_t status1 = 0; + int ret; + int irq; + + pdata = dev_get_platdata(pdev->dev.parent); + + if (!pdata && tps65090_mfd->dev->of_node) + pdata = tps65090_parse_dt_charger_data(pdev); + + if (!pdata) { + dev_err(&pdev->dev, "%s():no platform data available\n", + __func__); + return -ENODEV; + } + + cdata = devm_kzalloc(&pdev->dev, sizeof(*cdata), GFP_KERNEL); + if (!cdata) { + dev_err(&pdev->dev, "failed to allocate memory status\n"); + return -ENOMEM; + } + + dev_set_drvdata(&pdev->dev, cdata); + + cdata->dev = &pdev->dev; + cdata->pdata = pdata; + + cdata->ac.name = "tps65090-ac"; + cdata->ac.type = POWER_SUPPLY_TYPE_MAINS; + cdata->ac.get_property = tps65090_ac_get_property; + cdata->ac.properties = tps65090_ac_props; + cdata->ac.num_properties = ARRAY_SIZE(tps65090_ac_props); + cdata->ac.supplied_to = pdata->supplied_to; + cdata->ac.num_supplicants = pdata->num_supplicants; + + ret = power_supply_register(&pdev->dev, &cdata->ac); + if (ret) { + dev_err(&pdev->dev, "failed: power supply register\n"); + return ret; + } + + irq = platform_get_irq(pdev, 0); + if (irq <= 0) { + dev_warn(&pdev->dev, "Unable to get charger irq = %d\n", irq); + ret = irq; + goto fail_unregister_supply; + } + + cdata->irq = irq; + + ret = devm_request_threaded_irq(&pdev->dev, irq, NULL, + tps65090_charger_isr, 0, "tps65090-charger", cdata); + if (ret) { + 
dev_err(cdata->dev, "Unable to register irq %d err %d\n", irq, + ret); + goto fail_free_irq; + } + + ret = tps65090_config_charger(cdata); + if (ret < 0) { + dev_err(&pdev->dev, "charger config failed, err %d\n", ret); + goto fail_free_irq; + } + + /* Check for charger presence */ + ret = tps65090_read(cdata->dev->parent, TPS65090_REG_CG_STATUS1, + &status1); + if (ret < 0) { + dev_err(cdata->dev, "%s(): Error in reading reg 0x%x", __func__, + TPS65090_REG_CG_STATUS1); + goto fail_free_irq; + } + + if (status1 != 0) { + ret = tps65090_enable_charging(cdata, 1); + if (ret < 0) { + dev_err(cdata->dev, "error enabling charger\n"); + goto fail_free_irq; + } + cdata->ac_online = 1; + power_supply_changed(&cdata->ac); + } + + return 0; + +fail_free_irq: + devm_free_irq(cdata->dev, irq, cdata); +fail_unregister_supply: + power_supply_unregister(&cdata->ac); + + return ret; +} + +static int tps65090_charger_remove(struct platform_device *pdev) +{ + struct tps65090_charger *cdata = dev_get_drvdata(&pdev->dev); + + devm_free_irq(cdata->dev, cdata->irq, cdata); + power_supply_unregister(&cdata->ac); + + return 0; +} + +static struct platform_driver tps65090_charger_driver = { + .driver = { + .name = "tps65090-charger", + .owner = THIS_MODULE, + }, + .probe = tps65090_charger_probe, + .remove = tps65090_charger_remove, +}; +module_platform_driver(tps65090_charger_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Syed Rafiuddin "); +MODULE_DESCRIPTION("tps65090 battery charger driver"); diff --git a/include/linux/mfd/tps65090.h b/include/linux/mfd/tps65090.h index 6694cf43e8b8..998628a2b08b 100644 --- a/include/linux/mfd/tps65090.h +++ b/include/linux/mfd/tps65090.h @@ -86,6 +86,11 @@ struct tps65090_regulator_plat_data { struct tps65090_platform_data { int irq_base; + + char **supplied_to; + size_t num_supplicants; + int enable_low_current_chrg; + struct tps65090_regulator_plat_data *reg_pdata[TPS65090_REGULATOR_MAX]; }; -- cgit From 3d6ee287a3e341c88eafd0b4620b12d640b3736b Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 12 Mar 2013 20:26:02 +0100 Subject: clk: Introduce optional is_prepared callback To reflect whether a clk_hw is prepared the clk_hw may implement the optional is_prepared callback. If not implemented we fall back to use the software prepare counter. Signed-off-by: Ulf Hansson Acked-by: Linus Walleij Signed-off-by: Mike Turquette --- drivers/clk/clk.c | 21 +++++++++++++++++++++ include/linux/clk-provider.h | 6 ++++++ 2 files changed, 27 insertions(+) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index ed87b2405806..7571b5054f3c 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -451,6 +451,27 @@ unsigned long __clk_get_flags(struct clk *clk) return !clk ? 0 : clk->flags; } +bool __clk_is_prepared(struct clk *clk) +{ + int ret; + + if (!clk) + return false; + + /* + * .is_prepared is optional for clocks that can prepare + * fall back to software usage counter if it is missing + */ + if (!clk->ops->is_prepared) { + ret = clk->prepare_count ? 1 : 0; + goto out; + } + + ret = clk->ops->is_prepared(clk->hw); +out: + return !!ret; +} + bool __clk_is_enabled(struct clk *clk) { int ret; diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 7f197d7addb0..ee946862e058 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -45,6 +45,10 @@ struct clk_hw; * undo any work done in the @prepare callback. Called with * prepare_lock held. 
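A minimal sketch of a driver implementing the new op, assuming hardware that reports its prepare state in a status register (the foo_* names, register offset, and the prepare/unprepare ops are invented for illustration):

	struct foo_clk {
		struct clk_hw hw;
		void __iomem *base;
	};

	#define FOO_CLK_STATUS		0x04
	#define FOO_CLK_PREPARED	BIT(0)

	static int foo_clk_is_prepared(struct clk_hw *hw)
	{
		struct foo_clk *fc = container_of(hw, struct foo_clk, hw);

		/* Report the real hardware state. The op may sleep, so a
		 * slow-bus (I2C/SPI) read would also be acceptable here. */
		return !!(readl(fc->base + FOO_CLK_STATUS) & FOO_CLK_PREPARED);
	}

	static const struct clk_ops foo_clk_ops = {
		.prepare	= foo_clk_prepare,
		.unprepare	= foo_clk_unprepare,
		.is_prepared	= foo_clk_is_prepared,
	};

Clocks that do not provide the op keep the old behaviour: __clk_is_prepared() falls back to the prepare_count.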
* + * @is_prepared: Queries the hardware to determine if the clock is prepared. + * This function is allowed to sleep. Optional, if this op is not + * set then the prepare count will be used. + * * @enable: Enable the clock atomically. This must not return until the * clock is generating a valid clock signal, usable by consumer * devices. Called with enable_lock held. This function must not @@ -108,6 +112,7 @@ struct clk_hw; struct clk_ops { int (*prepare)(struct clk_hw *hw); void (*unprepare)(struct clk_hw *hw); + int (*is_prepared)(struct clk_hw *hw); int (*enable)(struct clk_hw *hw); void (*disable)(struct clk_hw *hw); int (*is_enabled)(struct clk_hw *hw); @@ -351,6 +356,7 @@ unsigned int __clk_get_enable_count(struct clk *clk); unsigned int __clk_get_prepare_count(struct clk *clk); unsigned long __clk_get_rate(struct clk *clk); unsigned long __clk_get_flags(struct clk *clk); +bool __clk_is_prepared(struct clk *clk); bool __clk_is_enabled(struct clk *clk); struct clk *__clk_lookup(const char *name); -- cgit From 3cc8247f1dce79511de8bf0f69ab02a46cc315b7 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 12 Mar 2013 20:26:04 +0100 Subject: clk: Introduce optional unprepare_unused callback An unprepare_unused callback is introduced due to the same reasons to why the disable_unused callback was added. During the clk_disable_unused sequence, those clk_hw that needs specific treatment with regards to being unprepared, shall implement the unprepare_unused callback. Signed-off-by: Ulf Hansson Acked-by: Linus Walleij Signed-off-by: Mike Turquette --- drivers/clk/clk.c | 7 +++++-- include/linux/clk-provider.h | 5 +++++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk.c b/drivers/clk/clk.c index c0141f3e1109..253792a46c08 100644 --- a/drivers/clk/clk.c +++ b/drivers/clk/clk.c @@ -352,9 +352,12 @@ static void clk_unprepare_unused_subtree(struct clk *clk) if (clk->flags & CLK_IGNORE_UNUSED) return; - if (__clk_is_prepared(clk)) - if (clk->ops->unprepare) + if (__clk_is_prepared(clk)) { + if (clk->ops->unprepare_unused) + clk->ops->unprepare_unused(clk->hw); + else if (clk->ops->unprepare) clk->ops->unprepare(clk->hw); + } } /* caller must hold prepare_lock */ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index ee946862e058..56e6cc12c796 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -49,6 +49,10 @@ struct clk_hw; * This function is allowed to sleep. Optional, if this op is not * set then the prepare count will be used. * + * @unprepare_unused: Unprepare the clock atomically. Only called from + * clk_disable_unused for prepare clocks with special needs. + * Called with prepare mutex held. This function may sleep. + * * @enable: Enable the clock atomically. This must not return until the * clock is generating a valid clock signal, usable by consumer * devices. Called with enable_lock held. 
This function must not @@ -113,6 +117,7 @@ struct clk_ops { int (*prepare)(struct clk_hw *hw); void (*unprepare)(struct clk_hw *hw); int (*is_prepared)(struct clk_hw *hw); + void (*unprepare_unused)(struct clk_hw *hw); int (*enable)(struct clk_hw *hw); void (*disable)(struct clk_hw *hw); int (*is_enabled)(struct clk_hw *hw); -- cgit From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:20 -0700 Subject: sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY PF_THREAD_BOUND was originally used to mark kernel threads which were bound to a specific CPU using kthread_bind() and a task with the flag set allows cpus_allowed modifications only to itself. Workqueue is currently abusing it to prevent userland from meddling with cpus_allowed of workqueue workers. What we need is a flag to prevent userland from messing with cpus_allowed of certain kernel tasks. In kernel, anyone can (incorrectly) squash the flag, and, for worker-type usages, restricting cpus_allowed modification to the task itself doesn't provide meaningful extra proection as other tasks can inject work items to the task anyway. This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY. sched_setaffinity() checks the flag and return -EINVAL if set. set_cpus_allowed_ptr() is no longer affected by the flag. This will allow simplifying workqueue worker CPU affinity management. Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner --- include/linux/sched.h | 2 +- kernel/cgroup.c | 4 ++-- kernel/cpuset.c | 16 ++++++++-------- kernel/kthread.c | 2 +- kernel/sched/core.c | 9 ++++----- kernel/workqueue.c | 10 +++------- 6 files changed, 19 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..e5c64f7b8c1d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1793,7 +1793,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ #define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ #define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ -#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */ +#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a32f9432666c..3852d926322c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2224,11 +2224,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. 
*/ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 4f9dfe43ecbd..f22e94792707 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1388,16 +1388,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ ret = -EINVAL; - if (task->flags & PF_THREAD_BOUND) + if (task->flags & PF_NO_SETAFFINITY) goto out_unlock; ret = security_task_setscheduler(task); if (ret) diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..a2fbbb782bad 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -260,7 +260,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) { /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..23606ee961b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 969be0b72071..39a591f65b08 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1757,12 +1757,8 @@ static struct worker *create_worker(struct worker_pool *pool) set_user_nice(worker->task, pool->attrs->nice); set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* - * %PF_THREAD_BOUND is used to prevent userland from meddling with - * cpumask of workqueue workers. This is an abuse. We need - * %PF_NO_SETAFFINITY. 
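The user-visible effect of the new flag is that sched_setaffinity(2) now fails with EINVAL for such tasks; a quick userspace probe, assuming you pick a kworker pid by hand (e.g. from ps):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <sys/types.h>
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	/* Returns 0 if the task could be pinned to the given cpu. */
	static int try_pin(pid_t pid, int cpu)
	{
		cpu_set_t set;

		CPU_ZERO(&set);
		CPU_SET(cpu, &set);
		if (sched_setaffinity(pid, sizeof(set), &set) < 0) {
			/* Expect EINVAL when PF_NO_SETAFFINITY is set. */
			fprintf(stderr, "pid %d: %s\n", pid, strerror(errno));
			return -1;
		}
		return 0;
	}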
- */ - worker->task->flags |= PF_THREAD_BOUND; + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; /* * The caller is responsible for ensuring %POOL_DISASSOCIATED @@ -3876,7 +3872,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, } wq->rescuer = rescuer; - rescuer->task->flags |= PF_THREAD_BOUND; + rescuer->task->flags |= PF_NO_SETAFFINITY; wake_up_process(rescuer->task); } -- cgit From 4d10f054f7df600ec8a388091c93b2d976920de0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Mar 2013 15:38:50 +0100 Subject: clocksource: make CLOCKSOURCE_OF_DECLARE type safe This ensures that a function pointer passed into CLOCKSOURCE_OF_DECLARE takes the same arguments that we use for calling that function later. Also fix the extraneous semicolon at end of the CLOCKSOURCE_OF_DECLARE definition. Signed-off-by: Arnd Bergmann Acked-by: Rob Herring --- drivers/clocksource/clksrc-of.c | 3 ++- drivers/clocksource/vt8500_timer.c | 2 +- include/linux/clocksource.h | 11 +++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/clocksource/clksrc-of.c b/drivers/clocksource/clksrc-of.c index 3ef11fba781c..37f5325bec95 100644 --- a/drivers/clocksource/clksrc-of.c +++ b/drivers/clocksource/clksrc-of.c @@ -16,6 +16,7 @@ #include #include +#include extern struct of_device_id __clksrc_of_table[]; @@ -26,7 +27,7 @@ void __init clocksource_of_init(void) { struct device_node *np; const struct of_device_id *match; - void (*init_func)(struct device_node *); + clocksource_of_init_fn init_func; for_each_matching_node_and_match(np, __clksrc_of_table, &match) { init_func = match->data; diff --git a/drivers/clocksource/vt8500_timer.c b/drivers/clocksource/vt8500_timer.c index 242255285597..64f553f04fa4 100644 --- a/drivers/clocksource/vt8500_timer.c +++ b/drivers/clocksource/vt8500_timer.c @@ -165,4 +165,4 @@ static void __init vt8500_timer_init(struct device_node *np) 4, 0xf0000000); } -CLOCKSOURCE_OF_DECLARE(vt8500, "via,vt8500-timer", vt8500_timer_init) +CLOCKSOURCE_OF_DECLARE(vt8500, "via,vt8500-timer", vt8500_timer_init); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 08ed5e19d8c6..ac33184b14fd 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -332,16 +332,23 @@ extern int clocksource_mmio_init(void __iomem *, const char *, extern int clocksource_i8253_init(void); +struct device_node; +typedef void(*clocksource_of_init_fn)(struct device_node *); #ifdef CONFIG_CLKSRC_OF extern void clocksource_of_init(void); #define CLOCKSOURCE_OF_DECLARE(name, compat, fn) \ static const struct of_device_id __clksrc_of_table_##name \ __used __section(__clksrc_of_table) \ - = { .compatible = compat, .data = fn }; + = { .compatible = compat, \ + .data = (fn == (clocksource_of_init_fn)NULL) ? fn : fn } #else static inline void clocksource_of_init(void) {} -#define CLOCKSOURCE_OF_DECLARE(name, compat, fn) +#define CLOCKSOURCE_OF_DECLARE(name, compat, fn) \ + static const struct of_device_id __clksrc_of_table_##name \ + __unused __section(__clksrc_of_table) \ + = { .compatible = compat, \ + .data = (fn == (clocksource_of_init_fn)NULL) ? fn : fn } #endif #endif /* _LINUX_CLOCKSOURCE_H */ -- cgit From a9a0fef779074838230e04a322fd2bdc921f4f4f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 18 Mar 2013 13:22:19 +1030 Subject: virtio_ring: expose virtio barriers for use in vringh. The host side of ring needs this logic too. 
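With the weak_barriers flag passed as an argument, both sides can share the same publish idiom; in sketch form (vring, head, buf_phys, buf_len and weak_barriers are assumed to be set up by the caller):

	/* Producer: fill the descriptor and expose it, then publish the
	 * index. The write barrier orders the ring contents before the
	 * index update that makes them visible to the other side. */
	vring.desc[head].addr = buf_phys;
	vring.desc[head].len  = buf_len;
	vring.avail->ring[vring.avail->idx & (vring.num - 1)] = head;

	virtio_wmb(weak_barriers);	/* smp_wmb() or wmb(), decided at runtime */
	vring.avail->idx++;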
Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 33 +++++-------------------- include/linux/virtio_ring.h | 57 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index ffd7e7da5d3b..245177c286ae 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -24,27 +24,6 @@ #include #include -/* virtio guest is communicating with a virtual "device" that actually runs on - * a host processor. Memory barriers are used to control SMP effects. */ -#ifdef CONFIG_SMP -/* Where possible, use SMP barriers which are more lightweight than mandatory - * barriers, because mandatory barriers control MMIO effects on accesses - * through relaxed memory I/O windows (which virtio-pci does not use). */ -#define virtio_mb(vq) \ - do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0) -#define virtio_rmb(vq) \ - do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0) -#define virtio_wmb(vq) \ - do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0) -#else -/* We must force memory ordering even if guest is UP since host could be - * running on another CPU, but SMP barriers are defined to barrier() in that - * configuration. So fall back to mandatory barriers instead. */ -#define virtio_mb(vq) mb() -#define virtio_rmb(vq) rmb() -#define virtio_wmb(vq) wmb() -#endif - #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ #define BAD_RING(_vq, fmt, args...) \ @@ -276,7 +255,7 @@ add_head: /* Descriptors and available array need to be set before we expose the * new available array entries. */ - virtio_wmb(vq); + virtio_wmb(vq->weak_barriers); vq->vring.avail->idx++; vq->num_added++; @@ -312,7 +291,7 @@ bool virtqueue_kick_prepare(struct virtqueue *_vq) START_USE(vq); /* We need to expose available array entries before checking avail * event. */ - virtio_mb(vq); + virtio_mb(vq->weak_barriers); old = vq->vring.avail->idx - vq->num_added; new = vq->vring.avail->idx; @@ -436,7 +415,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) } /* Only get used array entries after they have been exposed by host. */ - virtio_rmb(vq); + virtio_rmb(vq->weak_barriers); last_used = (vq->last_used_idx & (vq->vring.num - 1)); i = vq->vring.used->ring[last_used].id; @@ -460,7 +439,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) * the read in the next get_buf call. */ if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); } #ifdef DEBUG @@ -513,7 +492,7 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) * entry. Always do both to keep code simple. 
*/ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; vring_used_event(&vq->vring) = vq->last_used_idx; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely(more_used(vq))) { END_USE(vq); return false; @@ -553,7 +532,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) /* TODO: tune this threshold */ bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; vring_used_event(&vq->vring) = vq->last_used_idx + bufs; - virtio_mb(vq); + virtio_mb(vq->weak_barriers); if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { END_USE(vq); return false; diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 63c6ea199519..ca3ad41c2c82 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -4,6 +4,63 @@ #include #include +/* + * Barriers in virtio are tricky. Non-SMP virtio guests can't assume + * they're not on an SMP host system, so they need to assume real + * barriers. Non-SMP virtio hosts could skip the barriers, but does + * anyone care? + * + * For virtio_pci on SMP, we don't need to order with respect to MMIO + * accesses through relaxed memory I/O windows, so smp_mb() et al are + * sufficient. + * + * For using virtio to talk to real devices (eg. other heterogeneous + * CPUs) we do need real barriers. In theory, we could be using both + * kinds of virtio, so it's a runtime decision, and the branch is + * actually quite cheap. + */ + +#ifdef CONFIG_SMP +static inline void virtio_mb(bool weak_barriers) +{ + if (weak_barriers) + smp_mb(); + else + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + if (weak_barriers) + smp_rmb(); + else + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + if (weak_barriers) + smp_wmb(); + else + wmb(); +} +#else +static inline void virtio_mb(bool weak_barriers) +{ + mb(); +} + +static inline void virtio_rmb(bool weak_barriers) +{ + rmb(); +} + +static inline void virtio_wmb(bool weak_barriers) +{ + wmb(); +} +#endif + struct virtio_device; struct virtqueue; -- cgit From f87d0fbb579818fed3eeb0923cc253163ab93039 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 13:50:14 +1030 Subject: vringh: host-side implementation of virtio rings. Getting use of virtio rings correct is tricky, and a recent patch saw an implementation of in-kernel rings (as separate from userspace). This abstracts the business of dealing with the virtio ring layout from the access (userspace or direct); to do this, we use function pointers, which gcc inlines correctly. Signed-off-by: Rusty Russell Acked-by: Michael S. 
Tsirkin --- drivers/Makefile | 2 +- drivers/vhost/Kconfig | 8 + drivers/vhost/Kconfig.tcm | 1 + drivers/vhost/Makefile | 2 + drivers/vhost/vringh.c | 1007 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/vringh.h | 196 +++++++++ 6 files changed, 1215 insertions(+), 1 deletion(-) create mode 100644 drivers/vhost/vringh.c create mode 100644 include/linux/vringh.h (limited to 'include/linux') diff --git a/drivers/Makefile b/drivers/Makefile index dce39a95fa71..72d28d34ee24 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -123,7 +123,7 @@ obj-$(CONFIG_PPC_PS3) += ps3/ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_BCMA) += bcma/ -obj-$(CONFIG_VHOST_NET) += vhost/ +obj-$(CONFIG_VHOST_RING) += vhost/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_STAGING) += staging/ obj-y += platform/ diff --git a/drivers/vhost/Kconfig b/drivers/vhost/Kconfig index bf243177ffe1..85b773a93a5d 100644 --- a/drivers/vhost/Kconfig +++ b/drivers/vhost/Kconfig @@ -1,6 +1,7 @@ config VHOST_NET tristate "Host kernel accelerator for virtio net" depends on NET && EVENTFD && (TUN || !TUN) && (MACVTAP || !MACVTAP) + select VHOST_RING ---help--- This kernel module can be loaded in host kernel to accelerate guest networking with virtio_net. Not to be confused with virtio_net @@ -12,3 +13,10 @@ config VHOST_NET if STAGING source "drivers/vhost/Kconfig.tcm" endif + +config VHOST_RING + tristate + ---help--- + This option is selected by any driver which needs to access + the host side of a virtio ring. + diff --git a/drivers/vhost/Kconfig.tcm b/drivers/vhost/Kconfig.tcm index 7e3aa28d999e..c3a8cfa1de72 100644 --- a/drivers/vhost/Kconfig.tcm +++ b/drivers/vhost/Kconfig.tcm @@ -1,6 +1,7 @@ config TCM_VHOST tristate "TCM_VHOST fabric module" depends on TARGET_CORE && EVENTFD && m + select VHOST_RING default n ---help--- Say M here to enable the TCM_VHOST fabric module for use with virtio-scsi guests diff --git a/drivers/vhost/Makefile b/drivers/vhost/Makefile index a27b053bc9ab..1d37f5e12be6 100644 --- a/drivers/vhost/Makefile +++ b/drivers/vhost/Makefile @@ -2,3 +2,5 @@ obj-$(CONFIG_VHOST_NET) += vhost_net.o vhost_net-y := vhost.o net.o obj-$(CONFIG_TCM_VHOST) += tcm_vhost.o + +obj-$(CONFIG_VHOST_RING) += vringh.o diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c new file mode 100644 index 000000000000..bff0775e258c --- /dev/null +++ b/drivers/vhost/vringh.c @@ -0,0 +1,1007 @@ +/* + * Helpers for the host side of a virtio ring. + * + * Since these may be in userspace, we use (inline) accessors. + */ +#include +#include +#include +#include +#include +#include +#include + +static __printf(1,2) __cold void vringh_bad(const char *fmt, ...) +{ + static DEFINE_RATELIMIT_STATE(vringh_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (__ratelimit(&vringh_rs)) { + va_list ap; + va_start(ap, fmt); + printk(KERN_NOTICE "vringh:"); + vprintk(fmt, ap); + va_end(ap); + } +} + +/* Returns vring->num if empty, -ve on error. */ +static inline int __vringh_get_head(const struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + u16 *last_avail_idx) +{ + u16 avail_idx, i, head; + int err; + + err = getu16(&avail_idx, &vrh->vring.avail->idx); + if (err) { + vringh_bad("Failed to access avail idx at %p", + &vrh->vring.avail->idx); + return err; + } + + if (*last_avail_idx == avail_idx) + return vrh->vring.num; + + /* Only get avail ring entries after they have been exposed by guest. 
*/ + virtio_rmb(vrh->weak_barriers); + + i = *last_avail_idx & (vrh->vring.num - 1); + + err = getu16(&head, &vrh->vring.avail->ring[i]); + if (err) { + vringh_bad("Failed to read head: idx %d address %p", + *last_avail_idx, &vrh->vring.avail->ring[i]); + return err; + } + + if (head >= vrh->vring.num) { + vringh_bad("Guest says index %u > %u is available", + head, vrh->vring.num); + return -EINVAL; + } + + (*last_avail_idx)++; + return head; +} + +/* Copy some bytes to/from the iovec. Returns num copied. */ +static inline ssize_t vringh_iov_xfer(struct vringh_kiov *iov, + void *ptr, size_t len, + int (*xfer)(void *addr, void *ptr, + size_t len)) +{ + int err, done = 0; + + while (len && iov->i < iov->used) { + size_t partlen; + + partlen = min(iov->iov[iov->i].iov_len, len); + err = xfer(iov->iov[iov->i].iov_base, ptr, partlen); + if (err) + return err; + done += partlen; + len -= partlen; + ptr += partlen; + iov->consumed += partlen; + iov->iov[iov->i].iov_len -= partlen; + iov->iov[iov->i].iov_base += partlen; + + if (!iov->iov[iov->i].iov_len) { + /* Fix up old iov element then increment. */ + iov->iov[iov->i].iov_len = iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + + iov->consumed = 0; + iov->i++; + } + } + return done; +} + +/* May reduce *len if range is shorter. */ +static inline bool range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + if (addr < range->start || addr > range->end_incl) { + if (!getrange(vrh, addr, range)) + return false; + } + BUG_ON(addr < range->start || addr > range->end_incl); + + /* To end of memory? */ + if (unlikely(addr + *len == 0)) { + if (range->end_incl == -1ULL) + return true; + goto truncate; + } + + /* Otherwise, don't wrap. */ + if (addr + *len < addr) { + vringh_bad("Wrapping descriptor %zu@0x%llx", + *len, (unsigned long long)addr); + return false; + } + + if (unlikely(addr + *len - 1 > range->end_incl)) + goto truncate; + return true; + +truncate: + *len = range->end_incl + 1 - addr; + return true; +} + +static inline bool no_range_check(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, + u64, struct vringh_range *)) +{ + return true; +} + +/* No reason for this code to be inline. */ +static int move_to_indirect(int *up_next, u16 *i, void *addr, + const struct vring_desc *desc, + struct vring_desc **descs, int *desc_max) +{ + /* Indirect tables can't have indirect. */ + if (*up_next != -1) { + vringh_bad("Multilevel indirect %u->%u", *up_next, *i); + return -EINVAL; + } + + if (unlikely(desc->len % sizeof(struct vring_desc))) { + vringh_bad("Strange indirect len %u", desc->len); + return -EINVAL; + } + + /* We will check this when we follow it! */ + if (desc->flags & VRING_DESC_F_NEXT) + *up_next = desc->next; + else + *up_next = -2; + *descs = addr; + *desc_max = desc->len / sizeof(struct vring_desc); + + /* Now, start at the first indirect. 
*/ + *i = 0; + return 0; +} + +static int resize_iovec(struct vringh_kiov *iov, gfp_t gfp) +{ + struct kvec *new; + unsigned int flag, new_num = (iov->max_num & ~VRINGH_IOV_ALLOCATED) * 2; + + if (new_num < 8) + new_num = 8; + + flag = (iov->max_num & VRINGH_IOV_ALLOCATED); + if (flag) + new = krealloc(iov->iov, new_num * sizeof(struct iovec), gfp); + else { + new = kmalloc(new_num * sizeof(struct iovec), gfp); + if (new) { + memcpy(new, iov->iov, + iov->max_num * sizeof(struct iovec)); + flag = VRINGH_IOV_ALLOCATED; + } + } + if (!new) + return -ENOMEM; + iov->iov = new; + iov->max_num = (new_num | flag); + return 0; +} + +static u16 __cold return_from_indirect(const struct vringh *vrh, int *up_next, + struct vring_desc **descs, int *desc_max) +{ + u16 i = *up_next; + + *up_next = -1; + *descs = vrh->vring.desc; + *desc_max = vrh->vring.num; + return i; +} + +static int slow_copy(struct vringh *vrh, void *dst, const void *src, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *vrh, + u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *vrh, + u64 addr, + struct vringh_range *r), + struct vringh_range *range, + int (*copy)(void *dst, const void *src, size_t len)) +{ + size_t part, len = sizeof(struct vring_desc); + + do { + u64 addr; + int err; + + part = len; + addr = (u64)(unsigned long)src - range->offset; + + if (!rcheck(vrh, addr, &part, range, getrange)) + return -EINVAL; + + err = copy(dst, src, part); + if (err) + return err; + + dst += part; + src += part; + len -= part; + } while (len); + return 0; +} + +static inline int +__vringh_iov(struct vringh *vrh, u16 i, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + bool (*rcheck)(struct vringh *vrh, u64 addr, size_t *len, + struct vringh_range *range, + bool (*getrange)(struct vringh *, u64, + struct vringh_range *)), + bool (*getrange)(struct vringh *, u64, struct vringh_range *), + gfp_t gfp, + int (*copy)(void *dst, const void *src, size_t len)) +{ + int err, count = 0, up_next, desc_max; + struct vring_desc desc, *descs; + struct vringh_range range = { -1ULL, 0 }, slowrange; + bool slow = false; + + /* We start traversing vring's descriptor table. */ + descs = vrh->vring.desc; + desc_max = vrh->vring.num; + up_next = -1; + + if (riov) + riov->i = riov->used = 0; + else if (wiov) + wiov->i = wiov->used = 0; + else + /* You must want something! */ + BUG(); + + for (;;) { + void *addr; + struct vringh_kiov *iov; + size_t len; + + if (unlikely(slow)) + err = slow_copy(vrh, &desc, &descs[i], rcheck, getrange, + &slowrange, copy); + else + err = copy(&desc, &descs[i], sizeof(desc)); + if (unlikely(err)) + goto fail; + + if (unlikely(desc.flags & VRING_DESC_F_INDIRECT)) { + /* Make sure it's OK, and get offset. 
*/ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + + if (unlikely(len != desc.len)) { + slow = true; + /* We need to save this range to use offset */ + slowrange = range; + } + + addr = (void *)(long)(desc.addr + range.offset); + err = move_to_indirect(&up_next, &i, addr, &desc, + &descs, &desc_max); + if (err) + goto fail; + continue; + } + + if (count++ == vrh->vring.num) { + vringh_bad("Descriptor loop in %p", descs); + err = -ELOOP; + goto fail; + } + + if (desc.flags & VRING_DESC_F_WRITE) + iov = wiov; + else { + iov = riov; + if (unlikely(wiov && wiov->i)) { + vringh_bad("Readable desc %p after writable", + &descs[i]); + err = -EINVAL; + goto fail; + } + } + + if (!iov) { + vringh_bad("Unexpected %s desc", + !wiov ? "writable" : "readable"); + err = -EPROTO; + goto fail; + } + + again: + /* Make sure it's OK, and get offset. */ + len = desc.len; + if (!rcheck(vrh, desc.addr, &len, &range, getrange)) { + err = -EINVAL; + goto fail; + } + addr = (void *)(unsigned long)(desc.addr + range.offset); + + if (unlikely(iov->used == (iov->max_num & ~VRINGH_IOV_ALLOCATED))) { + err = resize_iovec(iov, gfp); + if (err) + goto fail; + } + + iov->iov[iov->used].iov_base = addr; + iov->iov[iov->used].iov_len = len; + iov->used++; + + if (unlikely(len != desc.len)) { + desc.len -= len; + desc.addr += len; + goto again; + } + + if (desc.flags & VRING_DESC_F_NEXT) { + i = desc.next; + } else { + /* Just in case we need to finish traversing above. */ + if (unlikely(up_next > 0)) { + i = return_from_indirect(vrh, &up_next, + &descs, &desc_max); + slow = false; + } else + break; + } + + if (i >= desc_max) { + vringh_bad("Chained index %u > %u", i, desc_max); + err = -EINVAL; + goto fail; + } + } + + return 0; + +fail: + return err; +} + +static inline int __vringh_complete(struct vringh *vrh, + const struct vring_used_elem *used, + unsigned int num_used, + int (*putu16)(u16 *p, u16 val), + int (*putused)(struct vring_used_elem *dst, + const struct vring_used_elem + *src, unsigned num)) +{ + struct vring_used *used_ring; + int err; + u16 used_idx, off; + + used_ring = vrh->vring.used; + used_idx = vrh->last_used_idx + vrh->completed; + + off = used_idx % vrh->vring.num; + + /* Compiler knows num_used == 1 sometimes, hence extra check */ + if (num_used > 1 && unlikely(off + num_used >= vrh->vring.num)) { + u16 part = vrh->vring.num - off; + err = putused(&used_ring->ring[off], used, part); + if (!err) + err = putused(&used_ring->ring[0], used + part, + num_used - part); + } else + err = putused(&used_ring->ring[off], used, num_used); + + if (err) { + vringh_bad("Failed to write %u used entries %u at %p", + num_used, off, &used_ring->ring[off]); + return err; + } + + /* Make sure buffer is written before we update index. */ + virtio_wmb(vrh->weak_barriers); + + err = putu16(&vrh->vring.used->idx, used_idx + num_used); + if (err) { + vringh_bad("Failed to update used index at %p", + &vrh->vring.used->idx); + return err; + } + + vrh->completed += num_used; + return 0; +} + + +static inline int __vringh_need_notify(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p)) +{ + bool notify; + u16 used_event; + int err; + + /* Flush out used index update. This is paired with the + * barrier that the Guest executes when enabling + * interrupts. */ + virtio_mb(vrh->weak_barriers); + + /* Old-style, without event indices. 
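For reference, the event-index test used on the "modern" path below is vring_need_event(): notify iff (u16)(new - event - 1) < (u16)(new - old), i.e. the other side's event index falls within the entries just published. The u16 arithmetic makes the test wrap-safe; a worked case:

	/* old = 0xfffe, three buffers completed, so new = 0x0001 (wrapped),
	 * and the other side set used_event = 0xffff:
	 *
	 *	(u16)(0x0001 - 0xffff - 1) = 0x0001
	 *	(u16)(0x0001 - 0xfffe)	   = 0x0003
	 *
	 * 0x0001 < 0x0003, so vring_need_event() returns true and we
	 * notify, even though the index wrapped past zero. */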
*/ + if (!vrh->event_indices) { + u16 flags; + err = getu16(&flags, &vrh->vring.avail->flags); + if (err) { + vringh_bad("Failed to get flags at %p", + &vrh->vring.avail->flags); + return err; + } + return (!(flags & VRING_AVAIL_F_NO_INTERRUPT)); + } + + /* Modern: we know when other side wants to know. */ + err = getu16(&used_event, &vring_used_event(&vrh->vring)); + if (err) { + vringh_bad("Failed to get used event idx at %p", + &vring_used_event(&vrh->vring)); + return err; + } + + /* Just in case we added so many that we wrap. */ + if (unlikely(vrh->completed > 0xffff)) + notify = true; + else + notify = vring_need_event(used_event, + vrh->last_used_idx + vrh->completed, + vrh->last_used_idx); + + vrh->last_used_idx += vrh->completed; + vrh->completed = 0; + return notify; +} + +static inline bool __vringh_notify_enable(struct vringh *vrh, + int (*getu16)(u16 *val, const u16 *p), + int (*putu16)(u16 *p, u16 val)) +{ + u16 avail; + + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, 0) != 0) { + vringh_bad("Clearing used flags %p", + &vrh->vring.used->flags); + return true; + } + } else { + if (putu16(&vring_avail_event(&vrh->vring), + vrh->last_avail_idx) != 0) { + vringh_bad("Updating avail event index %p", + &vring_avail_event(&vrh->vring)); + return true; + } + } + + /* They could have slipped one in as we were doing that: make + * sure it's written, then check again. */ + virtio_mb(vrh->weak_barriers); + + if (getu16(&avail, &vrh->vring.avail->idx) != 0) { + vringh_bad("Failed to check avail idx at %p", + &vrh->vring.avail->idx); + return true; + } + + /* This is unlikely, so we just leave notifications enabled + * (if we're using event_indices, we'll only get one + * notification anyway). */ + return avail == vrh->last_avail_idx; +} + +static inline void __vringh_notify_disable(struct vringh *vrh, + int (*putu16)(u16 *p, u16 val)) +{ + if (!vrh->event_indices) { + /* Old-school; update flags. */ + if (putu16(&vrh->vring.used->flags, VRING_USED_F_NO_NOTIFY)) { + vringh_bad("Setting used flags %p", + &vrh->vring.used->flags); + } + } +} + +/* Userspace access helpers: in this case, addresses are really userspace. */ +static inline int getu16_user(u16 *val, const u16 *p) +{ + return get_user(*val, (__force u16 __user *)p); +} + +static inline int putu16_user(u16 *p, u16 val) +{ + return put_user(val, (__force u16 __user *)p); +} + +static inline int copydesc_user(void *dst, const void *src, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int putused_user(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + return copy_to_user((__force void __user *)dst, src, + sizeof(*dst) * num) ? -EFAULT : 0; +} + +static inline int xfer_from_user(void *src, void *dst, size_t len) +{ + return copy_from_user(dst, (__force void __user *)src, len) ? + -EFAULT : 0; +} + +static inline int xfer_to_user(void *dst, void *src, size_t len) +{ + return copy_to_user((__force void __user *)dst, src, len) ? + -EFAULT : 0; +} + +/** + * vringh_init_user - initialize a vringh for a userspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. 
+ * + * Returns an error if num is invalid: you should check pointers + * yourself! + */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + /* vring expects kernel addresses, but only used via accessors. */ + vrh->vring.desc = (__force struct vring_desc *)desc; + vrh->vring.avail = (__force struct vring_avail *)avail; + vrh->vring.used = (__force struct vring_used *)used; + return 0; +} +EXPORT_SYMBOL(vringh_init_user); + +/** + * vringh_getdesc_user - get next available descriptor from userspace ring. + * @vrh: the userspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @getrange: function to call to check ranges. + * @head: head index we received, for passing to vringh_complete_user(). + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head) +{ + int err; + + *head = vrh->vring.num; + err = __vringh_get_head(vrh, getu16_user, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + /* We need the layouts to be the identical for this to work */ + BUILD_BUG_ON(sizeof(struct vringh_kiov) != sizeof(struct vringh_iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, iov) != + offsetof(struct vringh_iov, iov)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, i) != + offsetof(struct vringh_iov, i)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, used) != + offsetof(struct vringh_iov, used)); + BUILD_BUG_ON(offsetof(struct vringh_kiov, max_num) != + offsetof(struct vringh_iov, max_num)); + BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec)); + BUILD_BUG_ON(offsetof(struct iovec, iov_base) != + offsetof(struct kvec, iov_base)); + BUILD_BUG_ON(offsetof(struct iovec, iov_len) != + offsetof(struct kvec, iov_len)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_base) + != sizeof(((struct kvec *)NULL)->iov_base)); + BUILD_BUG_ON(sizeof(((struct iovec *)NULL)->iov_len) + != sizeof(((struct kvec *)NULL)->iov_len)); + + *head = err; + err = __vringh_iov(vrh, *head, (struct vringh_kiov *)riov, + (struct vringh_kiov *)wiov, + range_check, getrange, GFP_KERNEL, copydesc_user); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_user); + +/** + * vringh_iov_pull_user - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. 
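Tying the userspace entry points together, a host-side service loop looks roughly like this (error handling trimmed; my_getrange, the iovec arrays, buf, written and the eventfd signalling are caller-supplied placeholders):

	struct vringh vrh;
	struct vringh_iov riov, wiov;
	struct iovec r_iov[8], w_iov[8];
	u16 head;
	int err;

	vringh_init_user(&vrh, features, num, true, u_desc, u_avail, u_used);
	vringh_iov_init(&riov, r_iov, 8);
	vringh_iov_init(&wiov, w_iov, 8);

	err = vringh_getdesc_user(&vrh, &riov, &wiov, my_getrange, &head);
	if (err == 1) {			/* 0 = ring empty, <0 = error */
		ssize_t got = vringh_iov_pull_user(&riov, buf, sizeof(buf));
		/* ... handle request, vringh_iov_push_user() any reply ... */
		vringh_complete_user(&vrh, head, written);
		if (vringh_need_notify_user(&vrh) > 0)
			signal_guest_eventfd();	/* placeholder */
	}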
+ */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)riov, + dst, len, xfer_from_user); +} +EXPORT_SYMBOL(vringh_iov_pull_user); + +/** + * vringh_iov_push_user - copy bytes into vring_iov. + * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer((struct vringh_kiov *)wiov, + (void *)src, len, xfer_to_user); +} +EXPORT_SYMBOL(vringh_iov_push_user); + +/** + * vringh_abandon_user - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_user() to undo). + * + * The next vringh_get_user() will return the old descriptor(s) again. + */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_user); + +/** + * vringh_complete_user - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_user. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + return __vringh_complete(vrh, &used, 1, putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_user); + +/** + * vringh_complete_multi_user - we've finished with many descriptors. + * @vrh: the vring. + * @used: the head, length pairs. + * @num_used: the number of used elements. + * + * You should check vringh_need_notify_user() after one or more calls + * to this function. + */ +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used) +{ + return __vringh_complete(vrh, used, num_used, + putu16_user, putused_user); +} +EXPORT_SYMBOL(vringh_complete_multi_user); + +/** + * vringh_notify_enable_user - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_user(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_user, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_enable_user); + +/** + * vringh_notify_disable_user - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_user(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_user); +} +EXPORT_SYMBOL(vringh_notify_disable_user); + +/** + * vringh_need_notify_user - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_user() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_user(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_user); +} +EXPORT_SYMBOL(vringh_need_notify_user); + +/* Kernelspace access helpers. 
*/ +static inline int getu16_kern(u16 *val, const u16 *p) +{ + *val = ACCESS_ONCE(*p); + return 0; +} + +static inline int putu16_kern(u16 *p, u16 val) +{ + ACCESS_ONCE(*p) = val; + return 0; +} + +static inline int copydesc_kern(void *dst, const void *src, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +static inline int putused_kern(struct vring_used_elem *dst, + const struct vring_used_elem *src, + unsigned int num) +{ + memcpy(dst, src, num * sizeof(*dst)); + return 0; +} + +static inline int xfer_kern(void *src, void *dst, size_t len) +{ + memcpy(dst, src, len); + return 0; +} + +/** + * vringh_init_kern - initialize a vringh for a kernelspace vring. + * @vrh: the vringh to initialize. + * @features: the feature bits for this ring. + * @num: the number of elements. + * @weak_barriers: true if we only need memory barriers, not I/O. + * @desc: the userpace descriptor pointer. + * @avail: the userpace avail pointer. + * @used: the userpace used pointer. + * + * Returns an error if num is invalid. + */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used) +{ + /* Sane power of 2 please! */ + if (!num || num > 0xffff || (num & (num - 1))) { + vringh_bad("Bad ring size %u", num); + return -EINVAL; + } + + vrh->event_indices = (features & (1 << VIRTIO_RING_F_EVENT_IDX)); + vrh->weak_barriers = weak_barriers; + vrh->completed = 0; + vrh->last_avail_idx = 0; + vrh->last_used_idx = 0; + vrh->vring.num = num; + vrh->vring.desc = desc; + vrh->vring.avail = avail; + vrh->vring.used = used; + return 0; +} +EXPORT_SYMBOL(vringh_init_kern); + +/** + * vringh_getdesc_kern - get next available descriptor from kernelspace ring. + * @vrh: the kernelspace vring. + * @riov: where to put the readable descriptors (or NULL) + * @wiov: where to put the writable descriptors (or NULL) + * @head: head index we received, for passing to vringh_complete_kern(). + * @gfp: flags for allocating larger riov/wiov. + * + * Returns 0 if there was no descriptor, 1 if there was, or -errno. + * + * Note that on error return, you can tell the difference between an + * invalid ring and a single invalid descriptor: in the former case, + * *head will be vrh->vring.num. You may be able to ignore an invalid + * descriptor, but there's not much you can do with an invalid ring. + * + * Note that you may need to clean up riov and wiov, even on error! + */ +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp) +{ + int err; + + err = __vringh_get_head(vrh, getu16_kern, &vrh->last_avail_idx); + if (err < 0) + return err; + + /* Empty... */ + if (err == vrh->vring.num) + return 0; + + *head = err; + err = __vringh_iov(vrh, *head, riov, wiov, no_range_check, NULL, + gfp, copydesc_kern); + if (err) + return err; + + return 1; +} +EXPORT_SYMBOL(vringh_getdesc_kern); + +/** + * vringh_iov_pull_kern - copy bytes from vring_iov. + * @riov: the riov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len) +{ + return vringh_iov_xfer(riov, dst, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_pull_kern); + +/** + * vringh_iov_push_kern - copy bytes into vring_iov. 
+ * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) + * @dst: the place to copy. + * @len: the maximum length to copy. + * + * Returns the bytes copied <= len or a negative errno. + */ +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len) +{ + return vringh_iov_xfer(wiov, (void *)src, len, xfer_kern); +} +EXPORT_SYMBOL(vringh_iov_push_kern); + +/** + * vringh_abandon_kern - we've decided not to handle the descriptor(s). + * @vrh: the vring. + * @num: the number of descriptors to put back (ie. num + * vringh_get_kern() to undo). + * + * The next vringh_get_kern() will return the old descriptor(s) again. + */ +void vringh_abandon_kern(struct vringh *vrh, unsigned int num) +{ + /* We only update vring_avail_event(vr) when we want to be notified, + * so we haven't changed that yet. */ + vrh->last_avail_idx -= num; +} +EXPORT_SYMBOL(vringh_abandon_kern); + +/** + * vringh_complete_kern - we've finished with descriptor, publish it. + * @vrh: the vring. + * @head: the head as filled in by vringh_getdesc_kern. + * @len: the length of data we have written. + * + * You should check vringh_need_notify_kern() after one or more calls + * to this function. + */ +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len) +{ + struct vring_used_elem used; + + used.id = head; + used.len = len; + + return __vringh_complete(vrh, &used, 1, putu16_kern, putused_kern); +} +EXPORT_SYMBOL(vringh_complete_kern); + +/** + * vringh_notify_enable_kern - we want to know if something changes. + * @vrh: the vring. + * + * This always enables notifications, but returns false if there are + * now more buffers available in the vring. + */ +bool vringh_notify_enable_kern(struct vringh *vrh) +{ + return __vringh_notify_enable(vrh, getu16_kern, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_enable_kern); + +/** + * vringh_notify_disable_kern - don't tell us if something changes. + * @vrh: the vring. + * + * This is our normal running state: we disable and then only enable when + * we're going to sleep. + */ +void vringh_notify_disable_kern(struct vringh *vrh) +{ + __vringh_notify_disable(vrh, putu16_kern); +} +EXPORT_SYMBOL(vringh_notify_disable_kern); + +/** + * vringh_need_notify_kern - must we tell the other side about used buffers? + * @vrh: the vring we've called vringh_complete_kern() on. + * + * Returns -errno or 0 if we don't need to tell the other side, 1 if we do. + */ +int vringh_need_notify_kern(struct vringh *vrh) +{ + return __vringh_need_notify(vrh, getu16_kern); +} +EXPORT_SYMBOL(vringh_need_notify_kern); diff --git a/include/linux/vringh.h b/include/linux/vringh.h new file mode 100644 index 000000000000..b8f086625c49 --- /dev/null +++ b/include/linux/vringh.h @@ -0,0 +1,196 @@ +/* + * Linux host-side vring helpers; for when the kernel needs to access + * someone else's vring. + * + * Copyright IBM Corporation, 2013. + * Parts taken from drivers/vhost/vhost.c Copyright 2009 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Written by: Rusty Russell + */ +#ifndef _LINUX_VRINGH_H +#define _LINUX_VRINGH_H +#include +#include +#include +#include + +/* virtio_ring with information needed for host access. */ +struct vringh { + /* Guest publishes used event idx (note: we always do). */ + bool event_indices; + + /* Can we get away with weak barriers? */ + bool weak_barriers; + + /* Last available index we saw (ie. where we're up to). */ + u16 last_avail_idx; + + /* Last index we used. */ + u16 last_used_idx; + + /* How many descriptors we've completed since last need_notify(). */ + u32 completed; + + /* The vring (note: it may contain user pointers!) */ + struct vring vring; +}; + +/* The memory the vring can access, and what offset to apply. */ +struct vringh_range { + u64 start, end_incl; + u64 offset; +}; + +/** + * struct vringh_iov - iovec mangler. + * + * Mangles iovec in place, and restores it. + * Remaining data is iov + i, of used - i elements. + */ +struct vringh_iov { + struct iovec *iov; + size_t consumed; /* Within iov[i] */ + unsigned i, used, max_num; +}; + +/** + * struct vringh_iov - kvec mangler. + * + * Mangles kvec in place, and restores it. + * Remaining data is iov + i, of used - i elements. + */ +struct vringh_kiov { + struct kvec *iov; + size_t consumed; /* Within iov[i] */ + unsigned i, used, max_num; +}; + +/* Flag on max_num to indicate we're kmalloced. */ +#define VRINGH_IOV_ALLOCATED 0x8000000 + +/* Helpers for userspace vrings. */ +int vringh_init_user(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc __user *desc, + struct vring_avail __user *avail, + struct vring_used __user *used); + +static inline void vringh_iov_init(struct vringh_iov *iov, + struct iovec *iovec, unsigned num) +{ + iov->used = iov->i = 0; + iov->consumed = 0; + iov->max_num = num; + iov->iov = iovec; +} + +static inline void vringh_iov_reset(struct vringh_iov *iov) +{ + iov->iov[iov->i].iov_len += iov->consumed; + iov->iov[iov->i].iov_base -= iov->consumed; + iov->consumed = 0; + iov->i = 0; +} + +static inline void vringh_iov_cleanup(struct vringh_iov *iov) +{ + if (iov->max_num & VRINGH_IOV_ALLOCATED) + kfree(iov->iov); + iov->max_num = iov->used = iov->i = iov->consumed = 0; + iov->iov = NULL; +} + +/* Convert a descriptor into iovecs. */ +int vringh_getdesc_user(struct vringh *vrh, + struct vringh_iov *riov, + struct vringh_iov *wiov, + bool (*getrange)(struct vringh *vrh, + u64 addr, struct vringh_range *r), + u16 *head); + +/* Copy bytes from readable vsg, consuming it (and incrementing wiov->i). */ +ssize_t vringh_iov_pull_user(struct vringh_iov *riov, void *dst, size_t len); + +/* Copy bytes into writable vsg, consuming it (and incrementing wiov->i). */ +ssize_t vringh_iov_push_user(struct vringh_iov *wiov, + const void *src, size_t len); + +/* Mark a descriptor as used. */ +int vringh_complete_user(struct vringh *vrh, u16 head, u32 len); +int vringh_complete_multi_user(struct vringh *vrh, + const struct vring_used_elem used[], + unsigned num_used); + +/* Pretend we've never seen descriptor (for easy error handling). */ +void vringh_abandon_user(struct vringh *vrh, unsigned int num); + +/* Do we need to fire the eventfd to notify the other side? 
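The in-kernel variants mirror that flow one-for-one; a condensed sketch (the ring pointers and buf are assumed; GFP_KERNEL lets the kiov grow if a descriptor chain outruns the eight kvecs supplied):

	struct vringh vrh;
	struct vringh_kiov riov;
	struct kvec kv[8];
	u16 head;
	int err;

	vringh_init_kern(&vrh, features, num, false, k_desc, k_avail, k_used);
	vringh_kiov_init(&riov, kv, 8);

	err = vringh_getdesc_kern(&vrh, &riov, NULL, &head, GFP_KERNEL);
	if (err == 1) {
		ssize_t got = vringh_iov_pull_kern(&riov, buf, sizeof(buf));
		vringh_complete_kern(&vrh, head, 0);
		if (vringh_need_notify_kern(&vrh) > 0)
			my_notify();	/* placeholder: kick the other side */
	}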
*/ +int vringh_need_notify_user(struct vringh *vrh); + +bool vringh_notify_enable_user(struct vringh *vrh); +void vringh_notify_disable_user(struct vringh *vrh); + +/* Helpers for kernelspace vrings. */ +int vringh_init_kern(struct vringh *vrh, u32 features, + unsigned int num, bool weak_barriers, + struct vring_desc *desc, + struct vring_avail *avail, + struct vring_used *used); + +static inline void vringh_kiov_init(struct vringh_kiov *kiov, + struct kvec *kvec, unsigned num) +{ + kiov->used = kiov->i = 0; + kiov->consumed = 0; + kiov->max_num = num; + kiov->iov = kvec; +} + +static inline void vringh_kiov_reset(struct vringh_kiov *kiov) +{ + kiov->iov[kiov->i].iov_len += kiov->consumed; + kiov->iov[kiov->i].iov_base -= kiov->consumed; + kiov->consumed = 0; + kiov->i = 0; +} + +static inline void vringh_kiov_cleanup(struct vringh_kiov *kiov) +{ + if (kiov->max_num & VRINGH_IOV_ALLOCATED) + kfree(kiov->iov); + kiov->max_num = kiov->used = kiov->i = kiov->consumed = 0; + kiov->iov = NULL; +} + +int vringh_getdesc_kern(struct vringh *vrh, + struct vringh_kiov *riov, + struct vringh_kiov *wiov, + u16 *head, + gfp_t gfp); + +ssize_t vringh_iov_pull_kern(struct vringh_kiov *riov, void *dst, size_t len); +ssize_t vringh_iov_push_kern(struct vringh_kiov *wiov, + const void *src, size_t len); +void vringh_abandon_kern(struct vringh *vrh, unsigned int num); +int vringh_complete_kern(struct vringh *vrh, u16 head, u32 len); + +bool vringh_notify_enable_kern(struct vringh *vrh); +void vringh_notify_disable_kern(struct vringh *vrh); + +int vringh_need_notify_kern(struct vringh *vrh); + +#endif /* _LINUX_VRINGH_H */ -- cgit From 3beee86a4b9374e38dba36b44e81f1423a0d6b54 Mon Sep 17 00:00:00 2001 From: Sjur Brændeland Date: Wed, 20 Mar 2013 13:51:24 +1030 Subject: virtio: Introduce vringh wrappers in virtio_config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add wrappers for the host vrings to support loose coupling between the virtio device and driver. A new struct vringh_config_ops with the functions find_vrhs() and del_vrhs() is added to the virtio_device struct. This enables virtio drivers to manage virtio host rings without detailed knowledge of how the vrings are created and deleted. The function vringh_notify() is added so vringh clients can notify the other side that buffers are added to the used-ring. Cc: Ohad Ben-Cohen Signed-off-by: Sjur Brændeland Signed-off-by: Rusty Russell (constified vringh_config) --- include/linux/virtio.h | 3 +++ include/linux/vringh.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ff6714e6d0f5..5d5b3abc283d 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -8,6 +8,7 @@ #include #include #include +#include /** * virtqueue - a queue to register buffers for sending or receiving. @@ -70,6 +71,7 @@ static inline unsigned int virtqueue_get_queue_index(struct virtqueue *vq) * @dev: underlying device. * @id: the device type identification (used to match it with a driver). * @config: the configuration ops for this device. + * @vringh_config: configuration ops for host vrings. * @vqs: the list of virtqueues for this device. * @features: the features supported by both driver and device. * @priv: private pointer for the driver's use. 
@@ -79,6 +81,7 @@ struct virtio_device {
 	struct device dev;
 	struct virtio_device_id id;
 	const struct virtio_config_ops *config;
+	const struct vringh_config_ops *vringh_config;
 	struct list_head vqs;
 	/* Note that this is a Linux set_bit-style bitmap. */
 	unsigned long features[1];
diff --git a/include/linux/vringh.h b/include/linux/vringh.h
index b8f086625c49..749cde28728b 100644
--- a/include/linux/vringh.h
+++ b/include/linux/vringh.h
@@ -47,6 +47,28 @@ struct vringh {
 
 	/* The vring (note: it may contain user pointers!) */
 	struct vring vring;
+
+	/* The function to call to notify the guest about added buffers */
+	void (*notify)(struct vringh *);
+};
+
+/**
+ * struct vringh_config_ops - ops for creating a host vring from a virtio driver
+ * @find_vrhs: find the host vrings and instantiate them
+ *	vdev: the virtio_device
+ *	nhvrs: the number of host vrings to find
+ *	vrhs: on success, includes new host vrings
+ *	callbacks: array of driver callbacks, for each host vring
+ *		include a NULL entry for vqs that do not need a callback
+ *	Returns 0 on success or error status
+ * @del_vrhs: free the host vrings found by find_vrhs().
+ */
+struct virtio_device;
+typedef void vrh_callback_t(struct virtio_device *, struct vringh *);
+struct vringh_config_ops {
+	int (*find_vrhs)(struct virtio_device *vdev, unsigned nhvrs,
+			 struct vringh *vrhs[], vrh_callback_t *callbacks[]);
+	void (*del_vrhs)(struct virtio_device *vdev);
 };
 
 /* The memory the vring can access, and what offset to apply. */
@@ -193,4 +215,11 @@ void vringh_notify_disable_kern(struct vringh *vrh);
 
 int vringh_need_notify_kern(struct vringh *vrh);
 
+/* Notify the guest about buffers added to the used ring */
+static inline void vringh_notify(struct vringh *vrh)
+{
+	if (vrh->notify)
+		vrh->notify(vrh);
+}
+
 #endif /* _LINUX_VRINGH_H */
-- cgit 
From 0d2e1a2926b1839a4b74519e660739b2566c9386 Mon Sep 17 00:00:00 2001
From: Erwan Yvin
Date: Wed, 20 Mar 2013 13:52:24 +1030
Subject: caif_virtio: Introduce caif over virtio

Add the CAIF Virtio shared memory driver for talking to a modem.

This CAIF Link layer communicates to the modem over shared memory.
It is implemented as a virtio_driver. The underlying virtio device
is managed by the remoteproc framework. The Virtio queue is used for
transmitting data to the modem, and the new vringh is used for
receiving data.

Genalloc is used for managing the shared memory used for TX data.
The default dma-alloc-coherent allocator can only allocate whole
pages, and this wastes too much shared memory.

Flow control is implemented by stopping the TX-queues if the virtio
queues go full or we run out of memory. Queues are reopened when the
number of free descriptors comes back above the watermark.

NAPI is used in the RX path, and a dedicated tasklet is used for
releasing TX buffers.

Signed-off-by: Erwan Yvin
Acked-by: David S. Miller
Signed-off-by: Rusty Russell (minor fixes)
---
 drivers/net/caif/Kconfig        |  14 +
 drivers/net/caif/Makefile       |   3 +
 drivers/net/caif/caif_virtio.c  | 785 ++++++++++++++++++++++++++++++++++++++++
 include/linux/virtio_caif.h     |  24 ++
 include/uapi/linux/virtio_ids.h |   1 +
 5 files changed, 827 insertions(+)
 create mode 100644 drivers/net/caif/caif_virtio.c
 create mode 100644 include/linux/virtio_caif.h

(limited to 'include/linux')

diff --git a/drivers/net/caif/Kconfig b/drivers/net/caif/Kconfig
index 60c2142373c9..893f9154011e 100644
--- a/drivers/net/caif/Kconfig
+++ b/drivers/net/caif/Kconfig
@@ -47,3 +47,17 @@ config CAIF_HSI
 	The caif low level driver for CAIF over HSI.
Be aware that if you enable this then you also need to enable a low-level HSI driver. + +config CAIF_VIRTIO + tristate "CAIF virtio transport driver" + depends on CAIF + select VHOST_RING + select VIRTIO + select GENERIC_ALLOCATOR + default n + ---help--- + The caif driver for CAIF over Virtio. + +if CAIF_VIRTIO +source "drivers/vhost/Kconfig" +endif diff --git a/drivers/net/caif/Makefile b/drivers/net/caif/Makefile index 91dff861560f..d9ee26a96c6e 100644 --- a/drivers/net/caif/Makefile +++ b/drivers/net/caif/Makefile @@ -13,3 +13,6 @@ obj-$(CONFIG_CAIF_SHM) += caif_shm.o # HSI interface obj-$(CONFIG_CAIF_HSI) += caif_hsi.o + +# Virtio interface +obj-$(CONFIG_CAIF_VIRTIO) += caif_virtio.o diff --git a/drivers/net/caif/caif_virtio.c b/drivers/net/caif/caif_virtio.c new file mode 100644 index 000000000000..b1e1205e4e28 --- /dev/null +++ b/drivers/net/caif/caif_virtio.c @@ -0,0 +1,785 @@ +/* + * Copyright (C) ST-Ericsson AB 2013 + * Authors: Vicram Arv / vikram.arv@stericsson.com, + * Dmitry Tarnyagin / dmitry.tarnyagin@stericsson.com + * Sjur Brendeland / sjur.brandeland@stericsson.com + * License terms: GNU General Public License (GPL) version 2 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Vicram Arv "); +MODULE_AUTHOR("Sjur Brendeland "); +MODULE_DESCRIPTION("Virtio CAIF Driver"); + +/* NAPI schedule quota */ +#define CFV_DEFAULT_QUOTA 32 + +/* Defaults used if virtio config space is unavailable */ +#define CFV_DEF_MTU_SIZE 4096 +#define CFV_DEF_HEADROOM 32 +#define CFV_DEF_TAILROOM 32 + +/* Required IP header alignment */ +#define IP_HDR_ALIGN 4 + +/* struct cfv_napi_contxt - NAPI context info + * @riov: IOV holding data read from the ring. Note that riov may + * still hold data when cfv_rx_poll() returns. + * @head: Last descriptor ID we received from vringh_getdesc_kern. + * We use this to put descriptor back on the used ring. USHRT_MAX is + * used to indicate invalid head-id. + */ +struct cfv_napi_context { + struct vringh_kiov riov; + unsigned short head; +}; + +/* struct cfv_stats - statistics for debugfs + * @rx_napi_complete: Number of NAPI completions (RX) + * @rx_napi_resched: Number of calls where the full quota was used (RX) + * @rx_nomem: Number of SKB alloc failures (RX) + * @rx_kicks: Number of RX kicks + * @tx_full_ring: Number times TX ring was full + * @tx_no_mem: Number of times TX went out of memory + * @tx_flow_on: Number of flow on (TX) + * @tx_kicks: Number of TX kicks + */ +struct cfv_stats { + u32 rx_napi_complete; + u32 rx_napi_resched; + u32 rx_nomem; + u32 rx_kicks; + u32 tx_full_ring; + u32 tx_no_mem; + u32 tx_flow_on; + u32 tx_kicks; +}; + +/* struct cfv_info - Caif Virtio control structure + * @cfdev: caif common header + * @vdev: Associated virtio device + * @vr_rx: rx/downlink host vring + * @vq_tx: tx/uplink virtqueue + * @ndev: CAIF link layer device + * @watermark_tx: indicates number of free descriptors we need + * to reopen the tx-queues after overload. 
+ * @tx_lock: protects vq_tx from concurrent use + * @tx_release_tasklet: Tasklet for freeing consumed TX buffers + * @napi: Napi context used in cfv_rx_poll() + * @ctx: Context data used in cfv_rx_poll() + * @tx_hr: transmit headroom + * @rx_hr: receive headroom + * @tx_tr: transmit tail room + * @rx_tr: receive tail room + * @mtu: transmit max size + * @mru: receive max size + * @allocsz: size of dma memory reserved for TX buffers + * @alloc_addr: virtual address to dma memory for TX buffers + * @alloc_dma: dma address to dma memory for TX buffers + * @genpool: Gen Pool used for allocating TX buffers + * @reserved_mem: Pointer to memory reserve allocated from genpool + * @reserved_size: Size of memory reserve allocated from genpool + * @stats: Statistics exposed in sysfs + * @debugfs: Debugfs dentry for statistic counters + */ +struct cfv_info { + struct caif_dev_common cfdev; + struct virtio_device *vdev; + struct vringh *vr_rx; + struct virtqueue *vq_tx; + struct net_device *ndev; + unsigned int watermark_tx; + /* Protect access to vq_tx */ + spinlock_t tx_lock; + struct tasklet_struct tx_release_tasklet; + struct napi_struct napi; + struct cfv_napi_context ctx; + u16 tx_hr; + u16 rx_hr; + u16 tx_tr; + u16 rx_tr; + u32 mtu; + u32 mru; + size_t allocsz; + void *alloc_addr; + dma_addr_t alloc_dma; + struct gen_pool *genpool; + unsigned long reserved_mem; + size_t reserved_size; + struct cfv_stats stats; + struct dentry *debugfs; +}; + +/* struct buf_info - maintains transmit buffer data handle + * @size: size of transmit buffer + * @dma_handle: handle to allocated dma device memory area + * @vaddr: virtual address mapping to allocated memory area + */ +struct buf_info { + size_t size; + u8 *vaddr; +}; + +/* Called from virtio device, in IRQ context */ +static void cfv_release_cb(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + + ++cfv->stats.tx_kicks; + tasklet_schedule(&cfv->tx_release_tasklet); +} + +static void free_buf_info(struct cfv_info *cfv, struct buf_info *buf_info) +{ + if (!buf_info) + return; + gen_pool_free(cfv->genpool, (unsigned long) buf_info->vaddr, + buf_info->size); + kfree(buf_info); +} + +/* This is invoked whenever the remote processor completed processing + * a TX msg we just sent, and the buffer is put back to the used ring. + */ +static void cfv_release_used_buf(struct virtqueue *vq_tx) +{ + struct cfv_info *cfv = vq_tx->vdev->priv; + unsigned long flags; + + BUG_ON(vq_tx != cfv->vq_tx); + + for (;;) { + unsigned int len; + struct buf_info *buf_info; + + /* Get used buffer from used ring to recycle used descriptors */ + spin_lock_irqsave(&cfv->tx_lock, flags); + buf_info = virtqueue_get_buf(vq_tx, &len); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Stop looping if there are no more buffers to free */ + if (!buf_info) + break; + + free_buf_info(cfv, buf_info); + + /* watermark_tx indicates if we previously stopped the tx + * queues. If we have enough free stots in the virtio ring, + * re-establish memory reserved and open up tx queues. + */ + if (cfv->vq_tx->num_free <= cfv->watermark_tx) + continue; + + /* Re-establish memory reserve */ + if (cfv->reserved_mem == 0 && cfv->genpool) + cfv->reserved_mem = + gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + + /* Open up the tx queues */ + if (cfv->reserved_mem) { + cfv->watermark_tx = + virtqueue_get_vring_size(cfv->vq_tx); + netif_tx_wake_all_queues(cfv->ndev); + /* Buffers are recycled in cfv_netdev_tx, so + * disable notifications when queues are opened. 
+			 */
+			virtqueue_disable_cb(cfv->vq_tx);
+			++cfv->stats.tx_flow_on;
+		} else {
+			/* if no memory reserve, wait for more free slots */
+			WARN_ON(cfv->watermark_tx >
+				virtqueue_get_vring_size(cfv->vq_tx));
+			cfv->watermark_tx +=
+				virtqueue_get_vring_size(cfv->vq_tx) / 4;
+		}
+	}
+}
+
+/* Allocate an SKB and copy packet data to it */
+static struct sk_buff *cfv_alloc_and_copy_skb(int *err,
+					      struct cfv_info *cfv,
+					      u8 *frm, u32 frm_len)
+{
+	struct sk_buff *skb;
+	u32 cfpkt_len, pad_len;
+
+	*err = 0;
+	/* Verify the frame length against the MRU and down-link head/tail room */
+	if (frm_len > cfv->mru || frm_len <= cfv->rx_hr + cfv->rx_tr) {
+		netdev_err(cfv->ndev,
+			   "Invalid frmlen:%u mtu:%u hr:%d tr:%d\n",
+			   frm_len, cfv->mru, cfv->rx_hr,
+			   cfv->rx_tr);
+		*err = -EPROTO;
+		return NULL;
+	}
+
+	cfpkt_len = frm_len - (cfv->rx_hr + cfv->rx_tr);
+	pad_len = (unsigned long)(frm + cfv->rx_hr) & (IP_HDR_ALIGN - 1);
+
+	skb = netdev_alloc_skb(cfv->ndev, frm_len + pad_len);
+	if (!skb) {
+		*err = -ENOMEM;
+		return NULL;
+	}
+
+	skb_reserve(skb, cfv->rx_hr + pad_len);
+
+	memcpy(skb_put(skb, cfpkt_len), frm + cfv->rx_hr, cfpkt_len);
+	return skb;
+}
+
+/* Get packets from the host vring */
+static int cfv_rx_poll(struct napi_struct *napi, int quota)
+{
+	struct cfv_info *cfv = container_of(napi, struct cfv_info, napi);
+	int rxcnt = 0;
+	int err = 0;
+	void *buf;
+	struct sk_buff *skb;
+	struct vringh_kiov *riov = &cfv->ctx.riov;
+	unsigned int skb_len;
+
+again:
+	do {
+		skb = NULL;
+
+		/* Put the previous iovec back on the used ring and
+		 * fetch a new iovec if we have processed all elements.
+		 */
+		if (riov->i == riov->used) {
+			if (cfv->ctx.head != USHRT_MAX) {
+				vringh_complete_kern(cfv->vr_rx,
+						     cfv->ctx.head,
+						     0);
+				cfv->ctx.head = USHRT_MAX;
+			}
+
+			err = vringh_getdesc_kern(
+				cfv->vr_rx,
+				riov,
+				NULL,
+				&cfv->ctx.head,
+				GFP_ATOMIC);
+
+			if (err <= 0)
+				goto exit;
+		}
+
+		buf = phys_to_virt((unsigned long) riov->iov[riov->i].iov_base);
+		/* TODO: Add check on valid buffer address */
+
+		skb = cfv_alloc_and_copy_skb(&err, cfv, buf,
+					     riov->iov[riov->i].iov_len);
+		if (unlikely(err))
+			goto exit;
+
+		/* Push received packet up the stack. */
+		skb_len = skb->len;
+		skb->protocol = htons(ETH_P_CAIF);
+		skb_reset_mac_header(skb);
+		skb->dev = cfv->ndev;
+		err = netif_receive_skb(skb);
+		if (unlikely(err)) {
+			++cfv->ndev->stats.rx_dropped;
+		} else {
+			++cfv->ndev->stats.rx_packets;
+			cfv->ndev->stats.rx_bytes += skb_len;
+		}
+
+		++riov->i;
+		++rxcnt;
+	} while (rxcnt < quota);
+
+	++cfv->stats.rx_napi_resched;
+	goto out;
+
+exit:
+	switch (err) {
+	case 0:
+		++cfv->stats.rx_napi_complete;
+
+		/* Really out of packets?
(stolen from virtio_net)*/ + napi_complete(napi); + if (unlikely(vringh_notify_enable_kern(cfv->vr_rx)) && + napi_schedule_prep(napi)) { + vringh_notify_disable_kern(cfv->vr_rx); + __napi_schedule(napi); + goto again; + } + break; + + case -ENOMEM: + ++cfv->stats.rx_nomem; + dev_kfree_skb(skb); + /* Stop NAPI poll on OOM, we hope to be polled later */ + napi_complete(napi); + vringh_notify_enable_kern(cfv->vr_rx); + break; + + default: + /* We're doomed, any modem fault is fatal */ + netdev_warn(cfv->ndev, "Bad ring, disable device\n"); + cfv->ndev->stats.rx_dropped = riov->used - riov->i; + napi_complete(napi); + vringh_notify_disable_kern(cfv->vr_rx); + netif_carrier_off(cfv->ndev); + break; + } +out: + if (rxcnt && vringh_need_notify_kern(cfv->vr_rx) > 0) + vringh_notify(cfv->vr_rx); + return rxcnt; +} + +static void cfv_recv(struct virtio_device *vdev, struct vringh *vr_rx) +{ + struct cfv_info *cfv = vdev->priv; + + ++cfv->stats.rx_kicks; + vringh_notify_disable_kern(cfv->vr_rx); + napi_schedule(&cfv->napi); +} + +static void cfv_destroy_genpool(struct cfv_info *cfv) +{ + if (cfv->alloc_addr) + dma_free_coherent(cfv->vdev->dev.parent->parent, + cfv->allocsz, cfv->alloc_addr, + cfv->alloc_dma); + + if (!cfv->genpool) + return; + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + gen_pool_destroy(cfv->genpool); + cfv->genpool = NULL; +} + +static int cfv_create_genpool(struct cfv_info *cfv) +{ + int err; + + /* dma_alloc can only allocate whole pages, and we need a more + * fine graned allocation so we use genpool. We ask for space needed + * by IP and a full ring. If the dma allcoation fails we retry with a + * smaller allocation size. + */ + err = -ENOMEM; + cfv->allocsz = (virtqueue_get_vring_size(cfv->vq_tx) * + (ETH_DATA_LEN + cfv->tx_hr + cfv->tx_tr) * 11)/10; + if (cfv->allocsz <= (num_possible_cpus() + 1) * cfv->ndev->mtu) + return -EINVAL; + + for (;;) { + if (cfv->allocsz <= num_possible_cpus() * cfv->ndev->mtu) { + netdev_info(cfv->ndev, "Not enough device memory\n"); + return -ENOMEM; + } + + cfv->alloc_addr = dma_alloc_coherent( + cfv->vdev->dev.parent->parent, + cfv->allocsz, &cfv->alloc_dma, + GFP_ATOMIC); + if (cfv->alloc_addr) + break; + + cfv->allocsz = (cfv->allocsz * 3) >> 2; + } + + netdev_dbg(cfv->ndev, "Allocated %zd bytes from dma-memory\n", + cfv->allocsz); + + /* Allocate on 128 bytes boundaries (1 << 7)*/ + cfv->genpool = gen_pool_create(7, -1); + if (!cfv->genpool) + goto err; + + err = gen_pool_add_virt(cfv->genpool, (unsigned long)cfv->alloc_addr, + (phys_addr_t)virt_to_phys(cfv->alloc_addr), + cfv->allocsz, -1); + if (err) + goto err; + + /* Reserve some memory for low memory situations. If we hit the roof + * in the memory pool, we stop TX flow and release the reserve. 
+ */ + cfv->reserved_size = num_possible_cpus() * cfv->ndev->mtu; + cfv->reserved_mem = gen_pool_alloc(cfv->genpool, + cfv->reserved_size); + if (!cfv->reserved_mem) + goto err; + + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx); + return 0; +err: + cfv_destroy_genpool(cfv); + return err; +} + +/* Enable the CAIF interface and allocate the memory-pool */ +static int cfv_netdev_open(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + + if (cfv_create_genpool(cfv)) + return -ENOMEM; + + netif_carrier_on(netdev); + napi_enable(&cfv->napi); + + /* Schedule NAPI to read any pending packets */ + napi_schedule(&cfv->napi); + return 0; +} + +/* Disable the CAIF interface and free the memory-pool */ +static int cfv_netdev_close(struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + unsigned long flags; + struct buf_info *buf_info; + + /* Disable interrupts, queues and NAPI polling */ + netif_carrier_off(netdev); + virtqueue_disable_cb(cfv->vq_tx); + vringh_notify_disable_kern(cfv->vr_rx); + napi_disable(&cfv->napi); + + /* Release any TX buffers on both used and avilable rings */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + while ((buf_info = virtqueue_detach_unused_buf(cfv->vq_tx))) + free_buf_info(cfv, buf_info); + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* Release all dma allocated memory and destroy the pool */ + cfv_destroy_genpool(cfv); + return 0; +} + +/* Allocate a buffer in dma-memory and copy skb to it */ +static struct buf_info *cfv_alloc_and_copy_to_shm(struct cfv_info *cfv, + struct sk_buff *skb, + struct scatterlist *sg) +{ + struct caif_payload_info *info = (void *)&skb->cb; + struct buf_info *buf_info = NULL; + u8 pad_len, hdr_ofs; + + if (!cfv->genpool) + goto err; + + if (unlikely(cfv->tx_hr + skb->len + cfv->tx_tr > cfv->mtu)) { + netdev_warn(cfv->ndev, "Invalid packet len (%d > %d)\n", + cfv->tx_hr + skb->len + cfv->tx_tr, cfv->mtu); + goto err; + } + + buf_info = kmalloc(sizeof(struct buf_info), GFP_ATOMIC); + if (unlikely(!buf_info)) + goto err; + + /* Make the IP header aligned in tbe buffer */ + hdr_ofs = cfv->tx_hr + info->hdr_len; + pad_len = hdr_ofs & (IP_HDR_ALIGN - 1); + buf_info->size = cfv->tx_hr + skb->len + cfv->tx_tr + pad_len; + + /* allocate dma memory buffer */ + buf_info->vaddr = (void *)gen_pool_alloc(cfv->genpool, buf_info->size); + if (unlikely(!buf_info->vaddr)) + goto err; + + /* copy skbuf contents to send buffer */ + skb_copy_bits(skb, 0, buf_info->vaddr + cfv->tx_hr + pad_len, skb->len); + sg_init_one(sg, buf_info->vaddr + pad_len, + skb->len + cfv->tx_hr + cfv->rx_hr); + + return buf_info; +err: + kfree(buf_info); + return NULL; +} + +/* Put the CAIF packet on the virtio ring and kick the receiver */ +static int cfv_netdev_tx(struct sk_buff *skb, struct net_device *netdev) +{ + struct cfv_info *cfv = netdev_priv(netdev); + struct buf_info *buf_info; + struct scatterlist sg; + unsigned long flags; + bool flow_off = false; + int ret; + + /* garbage collect released buffers */ + cfv_release_used_buf(cfv->vq_tx); + spin_lock_irqsave(&cfv->tx_lock, flags); + + /* Flow-off check takes into account number of cpus to make sure + * virtqueue will not be overfilled in any possible smp conditions. 
+ * + * Flow-on is triggered when sufficient buffers are freed + */ + if (unlikely(cfv->vq_tx->num_free <= num_present_cpus())) { + flow_off = true; + cfv->stats.tx_full_ring++; + } + + /* If we run out of memory, we release the memory reserve and retry + * allocation. + */ + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + if (unlikely(!buf_info)) { + cfv->stats.tx_no_mem++; + flow_off = true; + + if (cfv->reserved_mem && cfv->genpool) { + gen_pool_free(cfv->genpool, cfv->reserved_mem, + cfv->reserved_size); + cfv->reserved_mem = 0; + buf_info = cfv_alloc_and_copy_to_shm(cfv, skb, &sg); + } + } + + if (unlikely(flow_off)) { + /* Turn flow on when a 1/4 of the descriptors are released */ + cfv->watermark_tx = virtqueue_get_vring_size(cfv->vq_tx) / 4; + /* Enable notifications of recycled TX buffers */ + virtqueue_enable_cb(cfv->vq_tx); + netif_tx_stop_all_queues(netdev); + } + + if (unlikely(!buf_info)) { + /* If the memory reserve does it's job, this shouldn't happen */ + netdev_warn(cfv->ndev, "Out of gen_pool memory\n"); + goto err; + } + + ret = virtqueue_add_buf(cfv->vq_tx, &sg, 1, 0, + buf_info, GFP_ATOMIC); + if (unlikely((ret < 0))) { + /* If flow control works, this shouldn't happen */ + netdev_warn(cfv->ndev, "Failed adding buffer to TX vring:%d\n", + ret); + goto err; + } + + /* update netdev statistics */ + cfv->ndev->stats.tx_packets++; + cfv->ndev->stats.tx_bytes += skb->len; + spin_unlock_irqrestore(&cfv->tx_lock, flags); + + /* tell the remote processor it has a pending message to read */ + virtqueue_kick(cfv->vq_tx); + + dev_kfree_skb(skb); + return NETDEV_TX_OK; +err: + spin_unlock_irqrestore(&cfv->tx_lock, flags); + cfv->ndev->stats.tx_dropped++; + free_buf_info(cfv, buf_info); + dev_kfree_skb(skb); + return NETDEV_TX_OK; +} + +static void cfv_tx_release_tasklet(unsigned long drv) +{ + struct cfv_info *cfv = (struct cfv_info *)drv; + cfv_release_used_buf(cfv->vq_tx); +} + +static const struct net_device_ops cfv_netdev_ops = { + .ndo_open = cfv_netdev_open, + .ndo_stop = cfv_netdev_close, + .ndo_start_xmit = cfv_netdev_tx, +}; + +static void cfv_netdev_setup(struct net_device *netdev) +{ + netdev->netdev_ops = &cfv_netdev_ops; + netdev->type = ARPHRD_CAIF; + netdev->tx_queue_len = 100; + netdev->flags = IFF_POINTOPOINT | IFF_NOARP; + netdev->mtu = CFV_DEF_MTU_SIZE; + netdev->destructor = free_netdev; +} + +/* Create debugfs counters for the device */ +static inline void debugfs_init(struct cfv_info *cfv) +{ + cfv->debugfs = + debugfs_create_dir(netdev_name(cfv->ndev), NULL); + + if (IS_ERR(cfv->debugfs)) + return; + + debugfs_create_u32("rx-napi-complete", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_complete); + debugfs_create_u32("rx-napi-resched", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_napi_resched); + debugfs_create_u32("rx-nomem", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_nomem); + debugfs_create_u32("rx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.rx_kicks); + debugfs_create_u32("tx-full-ring", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_full_ring); + debugfs_create_u32("tx-no-mem", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_no_mem); + debugfs_create_u32("tx-kicks", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_kicks); + debugfs_create_u32("tx-flow-on", S_IRUSR, cfv->debugfs, + &cfv->stats.tx_flow_on); +} + +/* Setup CAIF for the a virtio device */ +static int cfv_probe(struct virtio_device *vdev) +{ + vq_callback_t *vq_cbs = cfv_release_cb; + vrh_callback_t *vrh_cbs = cfv_recv; + const char *names = "output"; + const char *cfv_netdev_name = "cfvrt"; + struct net_device 
*netdev; + struct cfv_info *cfv; + int err = -EINVAL; + + netdev = alloc_netdev(sizeof(struct cfv_info), cfv_netdev_name, + cfv_netdev_setup); + if (!netdev) + return -ENOMEM; + + cfv = netdev_priv(netdev); + cfv->vdev = vdev; + cfv->ndev = netdev; + + spin_lock_init(&cfv->tx_lock); + + /* Get the RX virtio ring. This is a "host side vring". */ + err = vdev->vringh_config->find_vrhs(vdev, 1, &cfv->vr_rx, &vrh_cbs); + if (err) + goto err; + + /* Get the TX virtio ring. This is a "guest side vring". */ + err = vdev->config->find_vqs(vdev, 1, &cfv->vq_tx, &vq_cbs, &names); + if (err) + goto err; + + /* Get the CAIF configuration from virtio config space, if available */ +#define GET_VIRTIO_CONFIG_OPS(_v, _var, _f) \ + ((_v)->config->get(_v, offsetof(struct virtio_caif_transf_config, _f), \ + &_var, \ + FIELD_SIZEOF(struct virtio_caif_transf_config, _f))) + + if (vdev->config->get) { + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_hr, headroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->tx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->rx_tr, tailroom); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mtu, mtu); + GET_VIRTIO_CONFIG_OPS(vdev, cfv->mru, mtu); + } else { + cfv->tx_hr = CFV_DEF_HEADROOM; + cfv->rx_hr = CFV_DEF_HEADROOM; + cfv->tx_tr = CFV_DEF_TAILROOM; + cfv->rx_tr = CFV_DEF_TAILROOM; + cfv->mtu = CFV_DEF_MTU_SIZE; + cfv->mru = CFV_DEF_MTU_SIZE; + } + + netdev->needed_headroom = cfv->tx_hr; + netdev->needed_tailroom = cfv->tx_tr; + + /* Disable buffer release interrupts unless we have stopped TX queues */ + virtqueue_disable_cb(cfv->vq_tx); + + netdev->mtu = cfv->mtu - cfv->tx_tr; + vdev->priv = cfv; + + /* Initialize NAPI poll context data */ + vringh_kiov_init(&cfv->ctx.riov, NULL, 0); + cfv->ctx.head = USHRT_MAX; + netif_napi_add(netdev, &cfv->napi, cfv_rx_poll, CFV_DEFAULT_QUOTA); + + tasklet_init(&cfv->tx_release_tasklet, + cfv_tx_release_tasklet, + (unsigned long)cfv); + + /* Carrier is off until netdevice is opened */ + netif_carrier_off(netdev); + + /* register Netdev */ + err = register_netdev(netdev); + if (err) { + dev_err(&vdev->dev, "Unable to register netdev (%d)\n", err); + goto err; + } + + debugfs_init(cfv); + + return 0; +err: + netdev_warn(cfv->ndev, "CAIF Virtio probe failed:%d\n", err); + + if (cfv->vr_rx) + vdev->vringh_config->del_vrhs(cfv->vdev); + if (cfv->vdev) + vdev->config->del_vqs(cfv->vdev); + free_netdev(netdev); + return err; +} + +static void cfv_remove(struct virtio_device *vdev) +{ + struct cfv_info *cfv = vdev->priv; + + rtnl_lock(); + dev_close(cfv->ndev); + rtnl_unlock(); + + tasklet_kill(&cfv->tx_release_tasklet); + debugfs_remove_recursive(cfv->debugfs); + + vringh_kiov_cleanup(&cfv->ctx.riov); + vdev->config->reset(vdev); + vdev->vringh_config->del_vrhs(cfv->vdev); + cfv->vr_rx = NULL; + vdev->config->del_vqs(cfv->vdev); + unregister_netdev(cfv->ndev); +} + +static struct virtio_device_id id_table[] = { + { VIRTIO_ID_CAIF, VIRTIO_DEV_ANY_ID }, + { 0 }, +}; + +static unsigned int features[] = { +}; + +static struct virtio_driver caif_virtio_driver = { + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = cfv_probe, + .remove = cfv_remove, +}; + +module_virtio_driver(caif_virtio_driver); +MODULE_DEVICE_TABLE(virtio, id_table); diff --git a/include/linux/virtio_caif.h b/include/linux/virtio_caif.h new file mode 100644 index 000000000000..5d2d3124ca3d --- /dev/null +++ b/include/linux/virtio_caif.h @@ 
-0,0 +1,24 @@
+/*
+ * Copyright (C) ST-Ericsson AB 2012
+ * Author: Sjur Brændeland
+ *
+ * This header is BSD licensed so
+ * anyone can use the definitions to implement compatible remote processors
+ */
+
+#ifndef VIRTIO_CAIF_H
+#define VIRTIO_CAIF_H
+
+#include
+struct virtio_caif_transf_config {
+	u16 headroom;
+	u16 tailroom;
+	u32 mtu;
+	u8 reserved[4];
+};
+
+struct virtio_caif_config {
+	struct virtio_caif_transf_config uplink, downlink;
+	u8 reserved[8];
+};
+#endif
diff --git a/include/uapi/linux/virtio_ids.h b/include/uapi/linux/virtio_ids.h
index a7630d04029f..284fc3a05f7b 100644
--- a/include/uapi/linux/virtio_ids.h
+++ b/include/uapi/linux/virtio_ids.h
@@ -38,5 +38,6 @@
 #define VIRTIO_ID_SCSI		8 /* virtio scsi */
 #define VIRTIO_ID_9P		9 /* 9p virtio console */
 #define VIRTIO_ID_RPROC_SERIAL	11 /* virtio remoteproc serial link */
+#define VIRTIO_ID_CAIF		12 /* Virtio caif */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
-- cgit 
From c8164d8931fdee9ac5314708c4071adf1d997425 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Wed, 20 Mar 2013 15:37:08 +1030
Subject: scatterlist: introduce sg_unmark_end

This is useful in places that recycle the same scatterlist multiple
times, and do not want to incur the cost of sg_init_table every time
in hot paths.

Acked-by: Jens Axboe
Signed-off-by: Paolo Bonzini
Signed-off-by: Rusty Russell
---
 block/blk-integrity.c       |  2 +-
 block/blk-merge.c           |  2 +-
 include/linux/scatterlist.h | 16 ++++++++++++++++
 3 files changed, 18 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index dabd221857e1..03cf7179e8ef 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -110,7 +110,7 @@ new_segment:
 		if (!sg)
 			sg = sglist;
 		else {
-			sg->page_link &= ~0x02;
+			sg_unmark_end(sg);
 			sg = sg_next(sg);
 		}
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 936a110de0b9..5f2448253797 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -143,7 +143,7 @@ new_segment:
 			 * termination bit to avoid doing a full
 			 * sg_init_table() in drivers for each command.
 			 */
-			(*sg)->page_link &= ~0x02;
+			sg_unmark_end(*sg);
 			*sg = sg_next(*sg);
 		}
 
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 2d8bdaef9611..bfc47e0de81c 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -171,6 +171,22 @@ static inline void sg_mark_end(struct scatterlist *sg)
 	sg->page_link &= ~0x01;
 }
 
+/**
+ * sg_unmark_end - Undo setting the end of the scatterlist
+ * @sg:		 SG entry
+ *
+ * Description:
+ *   Removes the termination marker from the given entry of the scatterlist.
+ *
+ **/
+static inline void sg_unmark_end(struct scatterlist *sg)
+{
+#ifdef CONFIG_DEBUG_SG
+	BUG_ON(sg->sg_magic != SG_MAGIC);
+#endif
+	sg->page_link &= ~0x02;
+}
+
 /**
  * sg_phys - Return physical address of an sg entry
  * @sg:	 SG entry
-- cgit 
From 13816c768d46586e925b22736992258d6105ad2c Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Wed, 20 Mar 2013 15:37:09 +1030
Subject: virtio_ring: virtqueue_add_sgs, to add multiple sgs.

virtio_scsi can really use this, to avoid the current hack of copying
the whole sg array. Some other things get slightly neater, too.

This causes a slowdown in virtqueue_add_buf(), which is implemented
as a wrapper. This is addressed in the next patches.
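
As a minimal usage sketch of the new call (the names vq, req and its
fields below are hypothetical, not taken from this patch), a caller
with two readable buffers and one writable buffer hands over one
terminated scatterlist per buffer, readable lists first:

	/* Hypothetical caller: two readable sg lists, then one writable. */
	struct scatterlist hdr, out, in, *sgs[3];
	int err;

	sg_init_one(&hdr, &req->hdr, sizeof(req->hdr));		/* readable */
	sg_init_one(&out, req->data, req->data_len);		/* readable */
	sg_init_one(&in, &req->resp, sizeof(req->resp));	/* writable */

	sgs[0] = &hdr;
	sgs[1] = &out;
	sgs[2] = &in;

	/* out_sgs = 2, in_sgs = 1: readable lists must precede writable ones. */
	err = virtqueue_add_sgs(vq, sgs, 2, 1, req, GFP_ATOMIC);
	if (err < 0)
		return err;	/* e.g. -ENOSPC if the ring is full */
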
for i in `seq 50`; do /usr/bin/time -f 'Wall time:%e' ./vringh_test --indirect --eventidx --parallel --fast-vringh; done 2>&1 | stats --trim-outliers: Before: Using CPUS 0 and 3 Guest: notified 0, pinged 39009-39063(39062) Host: notified 39009-39063(39062), pinged 0 Wall time:1.700000-1.950000(1.723542) After: Using CPUS 0 and 3 Guest: notified 0, pinged 39062-39063(39063) Host: notified 39062-39063(39063), pinged 0 Wall time:1.760000-2.220000(1.789167) Signed-off-by: Rusty Russell Reviewed-by: Wanlong Gao Reviewed-by: Asias He --- drivers/virtio/virtio_ring.c | 220 ++++++++++++++++++++++++++++----------- include/linux/virtio.h | 7 ++ tools/virtio/linux/scatterlist.h | 16 +++ tools/virtio/linux/virtio.h | 7 ++ 4 files changed, 187 insertions(+), 63 deletions(-) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 245177c286ae..a78ad459cc85 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -98,16 +98,36 @@ struct vring_virtqueue #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +static inline struct scatterlist *sg_next_chained(struct scatterlist *sg, + unsigned int *count) +{ + return sg_next(sg); +} + +static inline struct scatterlist *sg_next_arr(struct scatterlist *sg, + unsigned int *count) +{ + if (--(*count) == 0) + return NULL; + return sg + 1; +} + /* Set up an indirect table of descriptors and add it to the queue. */ -static int vring_add_indirect(struct vring_virtqueue *vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - gfp_t gfp) +static inline int vring_add_indirect(struct vring_virtqueue *vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_sg, + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + gfp_t gfp) { struct vring_desc *desc; unsigned head; - int i; + struct scatterlist *sg; + int i, n; /* * We require lowmem mappings for the descriptors because @@ -116,25 +136,31 @@ static int vring_add_indirect(struct vring_virtqueue *vq, */ gfp &= ~(__GFP_HIGHMEM | __GFP_HIGH); - desc = kmalloc((out + in) * sizeof(struct vring_desc), gfp); + desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp); if (!desc) return -ENOMEM; - /* Transfer entries from the sg list into the indirect page */ - for (i = 0; i < out; i++) { - desc[i].flags = VRING_DESC_F_NEXT; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + /* Transfer entries from the sg lists into the indirect page */ + i = 0; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + desc[i].flags = VRING_DESC_F_NEXT; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } - for (; i < (out + in); i++) { - desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - desc[i].addr = sg_phys(sg); - desc[i].len = sg->length; - desc[i].next = i+1; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + desc[i].addr = sg_phys(sg); + desc[i].len = sg->length; + desc[i].next = i+1; + i++; + } } + BUG_ON(i != total_sg); /* Last one doesn't continue. */ desc[i-1].flags &= ~VRING_DESC_F_NEXT; @@ -155,29 +181,20 @@ static int vring_add_indirect(struct vring_virtqueue *vq, return head; } -/** - * virtqueue_add_buf - expose buffer to other end - * @vq: the struct virtqueue we're talking about. 
- * @sg: the description of the buffer(s). - * @out_num: the number of sg readable by other side - * @in_num: the number of sg which are writable (after readable ones) - * @data: the token identifying the buffer. - * @gfp: how to do memory allocations (if necessary). - * - * Caller must ensure we don't call this with other virtqueue operations - * at the same time (except where noted). - * - * Returns zero or a negative error (ie. ENOSPC, ENOMEM). - */ -int virtqueue_add_buf(struct virtqueue *_vq, - struct scatterlist sg[], - unsigned int out, - unsigned int in, - void *data, - gfp_t gfp) +static inline int virtqueue_add(struct virtqueue *_vq, + struct scatterlist *sgs[], + struct scatterlist *(*next) + (struct scatterlist *, unsigned int *), + unsigned int total_out, + unsigned int total_in, + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); - unsigned int i, avail, uninitialized_var(prev); + struct scatterlist *sg; + unsigned int i, n, avail, uninitialized_var(prev), total_sg; int head; START_USE(vq); @@ -197,46 +214,54 @@ int virtqueue_add_buf(struct virtqueue *_vq, } #endif + total_sg = total_in + total_out; + /* If the host supports indirect descriptor tables, and we have multiple * buffers, then go indirect. FIXME: tune this threshold */ - if (vq->indirect && (out + in) > 1 && vq->vq.num_free) { - head = vring_add_indirect(vq, sg, out, in, gfp); + if (vq->indirect && total_sg > 1 && vq->vq.num_free) { + head = vring_add_indirect(vq, sgs, next, total_sg, total_out, + total_in, + out_sgs, in_sgs, gfp); if (likely(head >= 0)) goto add_head; } - BUG_ON(out + in > vq->vring.num); - BUG_ON(out + in == 0); + BUG_ON(total_sg > vq->vring.num); + BUG_ON(total_sg == 0); - if (vq->vq.num_free < out + in) { + if (vq->vq.num_free < total_sg) { pr_debug("Can't add buf len %i - avail = %i\n", - out + in, vq->vq.num_free); + total_sg, vq->vq.num_free); /* FIXME: for historical reasons, we force a notify here if * there are outgoing parts to the buffer. Presumably the * host should service the ring ASAP. */ - if (out) + if (out_sgs) vq->notify(&vq->vq); END_USE(vq); return -ENOSPC; } /* We're about to use some buffers from the free list. */ - vq->vq.num_free -= out + in; - - head = vq->free_head; - for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + vq->vq.num_free -= total_sg; + + head = i = vq->free_head; + for (n = 0; n < out_sgs; n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_out)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } - for (; in; i = vq->vring.desc[i].next, in--) { - vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; - vq->vring.desc[i].addr = sg_phys(sg); - vq->vring.desc[i].len = sg->length; - prev = i; - sg++; + for (; n < (out_sgs + in_sgs); n++) { + for (sg = sgs[n]; sg; sg = next(sg, &total_in)) { + vq->vring.desc[i].flags = VRING_DESC_F_NEXT|VRING_DESC_F_WRITE; + vq->vring.desc[i].addr = sg_phys(sg); + vq->vring.desc[i].len = sg->length; + prev = i; + i = vq->vring.desc[i].next; + } } /* Last one doesn't continue. 
*/ vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT; @@ -269,8 +294,77 @@ add_head: return 0; } + +/** + * virtqueue_add_buf - expose buffer to other end + * @vq: the struct virtqueue we're talking about. + * @sg: the description of the buffer(s). + * @out_num: the number of sg readable by other side + * @in_num: the number of sg which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_buf(struct virtqueue *_vq, + struct scatterlist sg[], + unsigned int out, + unsigned int in, + void *data, + gfp_t gfp) +{ + struct scatterlist *sgs[2]; + + sgs[0] = sg; + sgs[1] = sg + out; + + return virtqueue_add(_vq, sgs, sg_next_arr, + out, in, out ? 1 : 0, in ? 1 : 0, data, gfp); +} EXPORT_SYMBOL_GPL(virtqueue_add_buf); +/** + * virtqueue_add_sgs - expose buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of terminated scatterlists. + * @out_num: the number of scatterlists readable by other side + * @in_num: the number of scatterlists which are writable (after readable ones) + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_sgs(struct virtqueue *_vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp) +{ + unsigned int i, total_out, total_in; + + /* Count them first. */ + for (i = total_out = total_in = 0; i < out_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_out++; + } + for (; i < out_sgs + in_sgs; i++) { + struct scatterlist *sg; + for (sg = sgs[i]; sg; sg = sg_next(sg)) + total_in++; + } + return virtqueue_add(_vq, sgs, sg_next_chained, + total_out, total_in, out_sgs, in_sgs, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_sgs); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 5d5b3abc283d..ac80288b2920 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,6 +41,13 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); bool virtqueue_kick_prepare(struct virtqueue *vq); diff --git a/tools/virtio/linux/scatterlist.h b/tools/virtio/linux/scatterlist.h index b2cf7d0f6133..68c9e2adc996 100644 --- a/tools/virtio/linux/scatterlist.h +++ b/tools/virtio/linux/scatterlist.h @@ -125,6 +125,22 @@ static inline void sg_mark_end(struct scatterlist *sg) sg->page_link &= ~0x01; } +/** + * sg_unmark_end - Undo setting the end of the scatterlist + * @sg: SG entryScatterlist + * + * Description: + * Removes the termination marker from the given entry of the scatterlist. 
+ * + **/ +static inline void sg_unmark_end(struct scatterlist *sg) +{ +#ifdef CONFIG_DEBUG_SG + BUG_ON(sg->sg_magic != SG_MAGIC); +#endif + sg->page_link &= ~0x02; +} + static inline struct scatterlist *sg_next(struct scatterlist *sg) { #ifdef CONFIG_DEBUG_SG diff --git a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h index e4af6591f5ff..5fa612ad932c 100644 --- a/tools/virtio/linux/virtio.h +++ b/tools/virtio/linux/virtio.h @@ -56,6 +56,13 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_sgs(struct virtqueue *vq, + struct scatterlist *sgs[], + unsigned int out_sgs, + unsigned int in_sgs, + void *data, + gfp_t gfp); + void virtqueue_kick(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); -- cgit From 282edb36499042a92b71f052f51754ae7ed936e4 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 20 Mar 2013 15:44:26 +1030 Subject: virtio_ring: virtqueue_add_outbuf / virtqueue_add_inbuf. These are specialized versions of virtqueue_add_buf(), which cover over 80% of cases and are far clearer. In particular, the scatterlists passed to these functions don't have to be clean (ie. we ignore end markers). Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/virtio.h | 10 ++++++++++ 2 files changed, 54 insertions(+) (limited to 'include/linux') diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index a78ad459cc85..5217baf5528c 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -365,6 +365,50 @@ int virtqueue_add_sgs(struct virtqueue *_vq, } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); +/** + * virtqueue_add_outbuf - expose output buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists readable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, num, 0, 1, 0, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); + +/** + * virtqueue_add_inbuf - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sgs: array of scatterlists (need not be terminated!) + * @num: the number of scatterlists writable by other side + * @data: the token identifying the buffer. + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM). + */ +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, sg_next_arr, 0, num, 0, 1, data, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. 
* @vq: the struct virtqueue diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ac80288b2920..833f17b6a743 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -41,6 +41,16 @@ int virtqueue_add_buf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_outbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + +int virtqueue_add_inbuf(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, -- cgit From 081aa458c38ba576bdd4265fc807fa95b48b9e79 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 13 Mar 2013 09:17:09 +0800 Subject: cgroup: consolidate cgroup_attach_task() and cgroup_attach_proc() These two functions share most of the code. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 3 +- kernel/cgroup.c | 109 +++++++++---------------------------------------- kernel/cpuset.c | 2 +- 3 files changed, 23 insertions(+), 91 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 7e818a3ef60a..01c48c6806d6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -693,7 +693,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, struct cgroup_iter *it); void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); -int cgroup_attach_task(struct cgroup *, struct task_struct *); +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); /* diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 54689fc008f6..04fa2abf94b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -59,7 +59,7 @@ #include /* TODO: replace with more sophisticated array */ #include #include -#include /* used in cgroup_attach_proc */ +#include /* used in cgroup_attach_task */ #include #include @@ -1943,82 +1943,6 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, put_css_set(oldcg); } -/** - * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' - * @cgrp: the cgroup the task is attaching to - * @tsk: the task to be attached - * - * Call with cgroup_mutex and threadgroup locked. May take task_lock of - * @tsk during call. - */ -int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) -{ - int retval = 0; - struct cgroup_subsys *ss, *failed_ss = NULL; - struct cgroup *oldcgrp; - struct cgroupfs_root *root = cgrp->root; - struct cgroup_taskset tset = { }; - struct css_set *newcg; - - /* @tsk either already exited or can't exit until the end */ - if (tsk->flags & PF_EXITING) - return -ESRCH; - - /* Nothing to do if the task is already in that cgroup */ - oldcgrp = task_cgroup_from_root(tsk, root); - if (cgrp == oldcgrp) - return 0; - - tset.single.task = tsk; - tset.single.cgrp = oldcgrp; - - for_each_subsys(root, ss) { - if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); - if (retval) { - /* - * Remember on which subsystem the can_attach() - * failed, so that we only call cancel_attach() - * against the subsystems whose can_attach() - * succeeded. 
(See below) - */ - failed_ss = ss; - goto out; - } - } - } - - newcg = find_css_set(tsk->cgroups, cgrp); - if (!newcg) { - retval = -ENOMEM; - goto out; - } - - cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); - - for_each_subsys(root, ss) { - if (ss->attach) - ss->attach(cgrp, &tset); - } - -out: - if (retval) { - for_each_subsys(root, ss) { - if (ss == failed_ss) - /* - * This subsystem was the one that failed the - * can_attach() check earlier, so we don't need - * to call cancel_attach() against it or any - * remaining subsystems. - */ - break; - if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); - } - } - return retval; -} - /** * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' * @from: attach to all cgroups of a given task @@ -2033,7 +1957,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) for_each_active_root(root) { struct cgroup *from_cg = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk); + retval = cgroup_attach_task(from_cg, tsk, false); if (retval) break; } @@ -2044,21 +1968,22 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) EXPORT_SYMBOL_GPL(cgroup_attach_task_all); /** - * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * cgroup_attach_task - attach a task or a whole threadgroup to a cgroup * @cgrp: the cgroup to attach to - * @leader: the threadgroup leader task_struct of the group to be attached + * @tsk: the task or the leader of the threadgroup to be attached + * @threadgroup: attach the whole threadgroup? * * Call holding cgroup_mutex and the group_rwsem of the leader. Will take - * task_lock of each thread in leader's threadgroup individually in turn. + * task_lock of @tsk or each thread in the threadgroup individually in turn. */ -static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, + bool threadgroup) { int retval, i, group_size; struct cgroup_subsys *ss, *failed_ss = NULL; - /* guaranteed to be initialized later, but the compiler needs this */ struct cgroupfs_root *root = cgrp->root; /* threadgroup list cursor and array */ - struct task_struct *tsk; + struct task_struct *leader = tsk; struct task_and_cgroup *tc; struct flex_array *group; struct cgroup_taskset tset = { }; @@ -2070,7 +1995,10 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) * group - group_rwsem prevents new threads from appearing, and if * threads exit, this will just be an over-estimate. */ - group_size = get_nr_threads(leader); + if (threadgroup) + group_size = get_nr_threads(tsk); + else + group_size = 1; /* flex_array supports very large thread-groups better than kmalloc. */ group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); if (!group) @@ -2080,7 +2008,6 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) if (retval) goto out_free_group_list; - tsk = leader; i = 0; /* * Prevent freeing of tasks while we take a snapshot. Tasks that are @@ -2109,6 +2036,9 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; + + if (!threadgroup) + break; } while_each_thread(leader, tsk); rcu_read_unlock(); /* remember the number of threads in the array for later. 
*/ @@ -2262,9 +2192,10 @@ retry_find_task: put_task_struct(tsk); goto retry_find_task; } - ret = cgroup_attach_proc(cgrp, tsk); - } else - ret = cgroup_attach_task(cgrp, tsk); + } + + ret = cgroup_attach_task(cgrp, tsk, threadgroup); + threadgroup_unlock(tsk); put_task_struct(tsk); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index efbfca7a33e4..98d458aad789 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2008,7 +2008,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, struct cgroup *new_cgroup = scan->data; cgroup_lock(); - cgroup_attach_task(new_cgroup, tsk); + cgroup_attach_task(new_cgroup, tsk, false); cgroup_unlock(); } -- cgit From f77668dc25b27270fe589031b22c432c3462b1d8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 Mar 2013 06:39:30 +0000 Subject: net: flow_dissector: add __skb_get_poff to get a start offset to payload __skb_get_poff() returns the offset to the payload as far as it could be dissected. The main user is currently BPF, so that we can dynamically truncate packets without needing to push actual payload to the user space and instead can analyze headers only. Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ net/core/flow_dissector.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b66ecc6ef102..497412165b1c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2840,6 +2840,8 @@ static inline void skb_checksum_none_assert(const struct sk_buff *skb) bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off); +u32 __skb_get_poff(const struct sk_buff *skb); + /** * skb_head_is_locked - Determine if the skb->head is locked down * @skb: skb to check diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index f4be293bab9e..00ee068efc1c 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -5,6 +5,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -228,6 +232,59 @@ u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb, } EXPORT_SYMBOL(__skb_tx_hash); +/* __skb_get_poff() returns the offset to the payload as far as it could + * be dissected. The main user is currently BPF, so that we can dynamically + * truncate packets without needing to push actual payload to the user + * space and can analyze headers only, instead. + */ +u32 __skb_get_poff(const struct sk_buff *skb) +{ + struct flow_keys keys; + u32 poff = 0; + + if (!skb_flow_dissect(skb, &keys)) + return 0; + + poff += keys.thoff; + switch (keys.ip_proto) { + case IPPROTO_TCP: { + const struct tcphdr *tcph; + struct tcphdr _tcph; + + tcph = skb_header_pointer(skb, poff, sizeof(_tcph), &_tcph); + if (!tcph) + return poff; + + poff += max_t(u32, sizeof(struct tcphdr), tcph->doff * 4); + break; + } + case IPPROTO_UDP: + case IPPROTO_UDPLITE: + poff += sizeof(struct udphdr); + break; + /* For the rest, we do not really care about header + * extensions at this point for now. 
+ */ + case IPPROTO_ICMP: + poff += sizeof(struct icmphdr); + break; + case IPPROTO_ICMPV6: + poff += sizeof(struct icmp6hdr); + break; + case IPPROTO_IGMP: + poff += sizeof(struct igmphdr); + break; + case IPPROTO_DCCP: + poff += sizeof(struct dccp_hdr); + break; + case IPPROTO_SCTP: + poff += sizeof(struct sctphdr); + break; + } + + return poff; +} + static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index) { if (unlikely(queue_index >= dev->real_num_tx_queues)) { -- cgit From 3e5289d5e3f98b7b5b8cac32e9e5a7004c067436 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 Mar 2013 06:39:31 +0000 Subject: filter: add ANC_PAY_OFFSET instruction for loading payload start offset It is very useful to do dynamic truncation of packets. In particular, we're interested to push the necessary header bytes to the user space and cut off user payload that should probably not be transferred for some reasons (e.g. privacy, speed, or others). With the ancillary extension PAY_OFFSET, we can load it into the accumulator, and return it. E.g. in bpfc syntax ... ld #poff ; { 0x20, 0, 0, 0xfffff034 }, ret a ; { 0x16, 0, 0, 0x00000000 }, ... as a filter will accomplish this without having to do a big hackery in a BPF filter itself. Follow-up JIT implementations are welcome. Thanks to Eric Dumazet for suggesting and discussing this during the Netfilter Workshop in Copenhagen. Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/filter.h | 1 + include/uapi/linux/filter.h | 3 ++- net/core/filter.c | 5 +++++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index c45eabc135e1..d2059cb4e465 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -126,6 +126,7 @@ enum { BPF_S_ANC_SECCOMP_LD_W, BPF_S_ANC_VLAN_TAG, BPF_S_ANC_VLAN_TAG_PRESENT, + BPF_S_ANC_PAY_OFFSET, }; #endif /* __LINUX_FILTER_H__ */ diff --git a/include/uapi/linux/filter.h b/include/uapi/linux/filter.h index 9cfde6941099..8eb9ccaa5b48 100644 --- a/include/uapi/linux/filter.h +++ b/include/uapi/linux/filter.h @@ -129,7 +129,8 @@ struct sock_fprog { /* Required for SO_ATTACH_FILTER. 
*/ #define SKF_AD_ALU_XOR_X 40 #define SKF_AD_VLAN_TAG 44 #define SKF_AD_VLAN_TAG_PRESENT 48 -#define SKF_AD_MAX 52 +#define SKF_AD_PAY_OFFSET 52 +#define SKF_AD_MAX 56 #define SKF_NET_OFF (-0x100000) #define SKF_LL_OFF (-0x200000) diff --git a/net/core/filter.c b/net/core/filter.c index 2e20b55a7830..dad2a178f9f8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -348,6 +348,9 @@ load_b: case BPF_S_ANC_VLAN_TAG_PRESENT: A = !!vlan_tx_tag_present(skb); continue; + case BPF_S_ANC_PAY_OFFSET: + A = __skb_get_poff(skb); + continue; case BPF_S_ANC_NLATTR: { struct nlattr *nla; @@ -612,6 +615,7 @@ int sk_chk_filter(struct sock_filter *filter, unsigned int flen) ANCILLARY(ALU_XOR_X); ANCILLARY(VLAN_TAG); ANCILLARY(VLAN_TAG_PRESENT); + ANCILLARY(PAY_OFFSET); } /* ancillary operation unknown or unsupported */ @@ -814,6 +818,7 @@ static void sk_decode_filter(struct sock_filter *filt, struct sock_filter *to) [BPF_S_ANC_SECCOMP_LD_W] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_ANC_VLAN_TAG_PRESENT] = BPF_LD|BPF_B|BPF_ABS, + [BPF_S_ANC_PAY_OFFSET] = BPF_LD|BPF_B|BPF_ABS, [BPF_S_LD_W_LEN] = BPF_LD|BPF_W|BPF_LEN, [BPF_S_LD_W_IND] = BPF_LD|BPF_W|BPF_IND, [BPF_S_LD_H_IND] = BPF_LD|BPF_H|BPF_IND, -- cgit From 2b5faa4c553f90ee2dde1d976b220b1ca9741ef0 Mon Sep 17 00:00:00 2001 From: Jesper Derehag Date: Tue, 19 Mar 2013 20:50:05 +0000 Subject: connector: Added coredumping event to the process connector Process connector can now also detect coredumping events. The main aim of the patch is to get notified at the start of coredumping, instead of having to wait for it to finish and only then being notified through the EXIT event. It could be used, for instance, by process managers that want to be notified of process failures as soon as possible, rather than being notified only after the coredump, which could take on the order of minutes depending on the size of the coredump, piping and so on. Signed-off-by: Jesper Derehag Signed-off-by: David S.
Miller --- drivers/connector/cn_proc.c | 25 +++++++++++++++++++++++++ include/linux/cn_proc.h | 4 ++++ include/uapi/linux/cn_proc.h | 10 +++++++++- kernel/signal.c | 2 ++ 4 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c index 1110478dd0fd..08ae128cce9b 100644 --- a/drivers/connector/cn_proc.c +++ b/drivers/connector/cn_proc.c @@ -232,6 +232,31 @@ void proc_comm_connector(struct task_struct *task) cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); } +void proc_coredump_connector(struct task_struct *task) +{ + struct cn_msg *msg; + struct proc_event *ev; + __u8 buffer[CN_PROC_MSG_SIZE]; + struct timespec ts; + + if (atomic_read(&proc_event_num_listeners) < 1) + return; + + msg = (struct cn_msg *)buffer; + ev = (struct proc_event *)msg->data; + get_seq(&msg->seq, &ev->cpu); + ktime_get_ts(&ts); /* get high res monotonic timestamp */ + put_unaligned(timespec_to_ns(&ts), (__u64 *)&ev->timestamp_ns); + ev->what = PROC_EVENT_COREDUMP; + ev->event_data.coredump.process_pid = task->pid; + ev->event_data.coredump.process_tgid = task->tgid; + + memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id)); + msg->ack = 0; /* not used */ + msg->len = sizeof(*ev); + cn_netlink_send(msg, CN_IDX_PROC, GFP_KERNEL); +} + void proc_exit_connector(struct task_struct *task) { struct cn_msg *msg; diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h index 2c1bc1ea04ee..1d5b02a96c46 100644 --- a/include/linux/cn_proc.h +++ b/include/linux/cn_proc.h @@ -26,6 +26,7 @@ void proc_id_connector(struct task_struct *task, int which_id); void proc_sid_connector(struct task_struct *task); void proc_ptrace_connector(struct task_struct *task, int which_id); void proc_comm_connector(struct task_struct *task); +void proc_coredump_connector(struct task_struct *task); void proc_exit_connector(struct task_struct *task); #else static inline void proc_fork_connector(struct task_struct *task) @@ -48,6 +49,9 @@ static inline void proc_ptrace_connector(struct task_struct *task, int ptrace_id) {} +static inline void proc_coredump_connector(struct task_struct *task) +{} + static inline void proc_exit_connector(struct task_struct *task) {} #endif /* CONFIG_PROC_EVENTS */ diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h index 0d7b49973bb3..f6c271035bbd 100644 --- a/include/uapi/linux/cn_proc.h +++ b/include/uapi/linux/cn_proc.h @@ -56,7 +56,9 @@ struct proc_event { PROC_EVENT_PTRACE = 0x00000100, PROC_EVENT_COMM = 0x00000200, /* "next" should be 0x00000400 */ - /* "last" is the last process event: exit */ + /* "last" is the last process event: exit, + * while "next to last" is coredumping event */ + PROC_EVENT_COREDUMP = 0x40000000, PROC_EVENT_EXIT = 0x80000000 } what; __u32 cpu; @@ -110,11 +112,17 @@ struct proc_event { char comm[16]; } comm; + struct coredump_proc_event { + __kernel_pid_t process_pid; + __kernel_pid_t process_tgid; + } coredump; + struct exit_proc_event { __kernel_pid_t process_pid; __kernel_pid_t process_tgid; __u32 exit_code, exit_signal; } exit; + } event_data; }; diff --git a/kernel/signal.c b/kernel/signal.c index dd72567767d9..497330ec2ae9 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -32,6 +32,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2350,6 +2351,7 @@ relock: if (sig_kernel_coredump(signr)) { if (print_fatal_signals) print_fatal_signal(info->si_signo); + proc_coredump_connector(current); /* * If it was able to dump core, this kills all * other 
threads in the group and synchronizes with -- cgit From a831881be220358a1d28c5d95d69449fb6d623ca Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 18 Dec 2012 17:32:19 +0100 Subject: nohz: Basic full dynticks interface For extreme use cases such as Real Time or HPC, having the ability to shut down the tick when a single task runs on a CPU is a desired feature: * Reducing the number of interrupts improves throughput for CPU-bound tasks. The CPU is less distracted from its real job, from both an execution-time and a cache point of view. * This also improves latency response, as we have fewer critical sections. Start with introducing a very simple interface to define full dynticks CPUs: a cpumask defined at boot time through the "nohz_extended=" kernel parameter. CPUs that are part of this range will have their tick shut down whenever possible: provided they run a single task and they don't do kernel activity that requires the periodic tick. These details will be documented later in Documentation/* An online CPU must be kept outside this range to handle the timekeeping. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/kernel-parameters.txt | 6 ++++ include/linux/tick.h | 7 +++++ kernel/time/Kconfig | 19 ++++++++++++ kernel/time/tick-sched.c | 62 +++++++++++++++++++++++++++++++++++++ 4 files changed, 94 insertions(+) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..231698feaddc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1913,6 +1913,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Valid arguments: on, off Default: on + nohz_extended= [KNL,BOOT] + In kernels built with CONFIG_NO_HZ_EXTENDED=y, set + the specified list of CPUs whose tick will be stopped + whenever possible. You need to keep at least one online + CPU outside the range to maintain the timekeeping. + noiotrap [SH] Disables trapped I/O port accesses. noirqdebug [X86-32] Disables the code which attempts to detect and diff --git a/include/linux/tick.h b/include/linux/tick.h index 553272e6af55..44bfa8aa439f 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -157,6 +157,13 @@ static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !NO_HZ */ +#ifdef CONFIG_NO_HZ_EXTENDED +extern int tick_nohz_extended_cpu(int cpu); +#else +static inline int tick_nohz_extended_cpu(int cpu) { return 0; } +#endif + + # ifdef CONFIG_CPU_IDLE_GOV_MENU extern void menu_hrtimer_cancel(void); # else diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 24510d84efd7..5a87c03e45ad 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -79,6 +79,25 @@ config NO_HZ only trigger on an as-needed basis both when the system is busy and when the system is idle. +config NO_HZ_EXTENDED + bool "Full dynticks system" + depends on NO_HZ && RCU_USER_QS && VIRT_CPU_ACCOUNTING_GEN && RCU_NOCB_CPU && SMP + select CONTEXT_TRACKING_FORCE + help + Adaptively try to shutdown the tick whenever possible, even when + the CPU is running tasks.
Typically this requires running a single + task on the CPU. Chances for running tickless are maximized when + the task mostly runs in userspace and has few kernel activity. + + You need to fill up the nohz_extended boot parameter with the + desired range of dynticks CPUs. + + This is implemented at the expense of some overhead in user <-> kernel + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + + Say N. + config HIGH_RES_TIMERS bool "High Resolution Timer Support" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a19a39952c1b..79c275f08b7d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -142,6 +142,68 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) profile_tick(CPU_PROFILING); } +#ifdef CONFIG_NO_HZ_EXTENDED +static cpumask_var_t nohz_extended_mask; +bool have_nohz_extended_mask; + +int tick_nohz_extended_cpu(int cpu) +{ + if (!have_nohz_extended_mask) + return 0; + + return cpumask_test_cpu(cpu, nohz_extended_mask); +} + +/* Parse the boot-time nohz CPU list from the kernel parameters. */ +static int __init tick_nohz_extended_setup(char *str) +{ + alloc_bootmem_cpumask_var(&nohz_extended_mask); + if (cpulist_parse(str, nohz_extended_mask) < 0) + pr_warning("NOHZ: Incorrect nohz_extended cpumask\n"); + else + have_nohz_extended_mask = true; + return 1; +} +__setup("nohz_extended=", tick_nohz_extended_setup); + +static int __init init_tick_nohz_extended(void) +{ + cpumask_var_t online_nohz; + int cpu; + + if (!have_nohz_extended_mask) + return 0; + + if (!zalloc_cpumask_var(&online_nohz, GFP_KERNEL)) { + pr_warning("NO_HZ: Not enough memory to check extended nohz mask\n"); + return -ENOMEM; + } + + /* + * CPUs can probably not be concurrently offlined on initcall time. + * But we are paranoid, aren't we? + */ + get_online_cpus(); + + /* Ensure we keep a CPU outside the dynticks range for timekeeping */ + cpumask_and(online_nohz, cpu_online_mask, nohz_extended_mask); + if (cpumask_equal(online_nohz, cpu_online_mask)) { + cpu = cpumask_any(cpu_online_mask); + pr_warning("NO_HZ: Must keep at least one online CPU " + "out of nohz_extended range\n"); + pr_warning("NO_HZ: Clearing %d from nohz_extended range\n", cpu); + cpumask_clear_cpu(cpu, nohz_extended_mask); + } + put_online_cpus(); + free_cpumask_var(online_nohz); + + return 0; +} +core_initcall(init_tick_nohz_extended); +#else +#define have_nohz_extended_mask (0) +#endif + /* * NOHZ - aka dynamic tick functionality */ -- cgit From 1c20091e77fc5a9b7d7d905176443b4822a23cdb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: nohz: Wake up full dynticks CPUs when a timer gets enqueued Wake up a CPU when a timer list timer is enqueued there and the target is part of the full dynticks range. Sending an IPI to it makes it reconsidering the next timer to program on top of recent updates. This may later be improved by checking if the tick is really stopped on the target. This would need some careful synchronization though. So deal with such optimization later and start simple. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E. 
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- include/linux/sched.h | 4 ++-- kernel/sched/core.c | 20 +++++++++++++++++++- kernel/timer.c | 12 ++++++------ 3 files changed, 27 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9004f6e19eac..10626e2ee688 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1851,9 +1851,9 @@ static inline void idle_task_exit(void) {} #endif #if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) -extern void wake_up_idle_cpu(int cpu); +extern void wake_up_nohz_cpu(int cpu); #else -static inline void wake_up_idle_cpu(int cpu) { } +static inline void wake_up_nohz_cpu(int cpu) { } #endif #ifdef CONFIG_SCHED_AUTOGROUP diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 849deb96e61e..e91ee589f793 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -587,7 +587,7 @@ unlock: * account when the CPU goes back to idle and evaluates the timer * wheel for the next timer event. */ -void wake_up_idle_cpu(int cpu) +static void wake_up_idle_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -617,6 +617,24 @@ void wake_up_idle_cpu(int cpu) smp_send_reschedule(cpu); } +static bool wake_up_extended_nohz_cpu(int cpu) +{ + if (tick_nohz_extended_cpu(cpu)) { + if (cpu != smp_processor_id() || + tick_nohz_tick_stopped()) + smp_send_reschedule(cpu); + return true; + } + + return false; +} + +void wake_up_nohz_cpu(int cpu) +{ + if (!wake_up_extended_nohz_cpu(cpu)) + wake_up_idle_cpu(cpu); +} + static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); diff --git a/kernel/timer.c b/kernel/timer.c index dbf7a78a1ef1..4e3040b40d16 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -930,14 +930,14 @@ void add_timer_on(struct timer_list *timer, int cpu) debug_activate(timer, timer->expires); internal_add_timer(base, timer); /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling + * Check whether the other CPU is in dynticks mode and needs + * to be triggered to reevaluate the timer wheel. + * We are protected against the other CPU fiddling * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. + * makes sure that a CPU on the way to stop its tick can not + * evaluate the timer wheel. */ - wake_up_idle_cpu(cpu); + wake_up_nohz_cpu(cpu); spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); -- cgit From 65deb782858128cde598ac4a9150ab7cdd29dafa Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 28 Feb 2013 16:43:19 +0000 Subject: arm: vexpress: Decouple vexpress-poweroff implementation from machine_desc This patch adds the pm_power_off and arm_pm_restart variable settings to the vexpress-poweroff.c driver to decouple it from the machine_desc definition. 
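The resulting pattern, roughly sketched (the myboard_* names are hypothetical), is that the hooks are claimed in the reset driver's probe rather than in the machine description, as the v2m.c and vexpress-poweroff.c hunks below show:

static void myboard_power_off(void)
{
	/* poke the board's power controller */
}

static int myboard_reset_probe(struct platform_device *pdev)
{
	/* claimed at probe time; no .restart or pm_power_off
	 * wiring is needed in the machine_desc anymore
	 */
	pm_power_off = myboard_power_off;
	return 0;
}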
Signed-off-by: Catalin Marinas Acked-by: Pawel Moll --- arch/arm/mach-vexpress/v2m.c | 5 ----- drivers/power/reset/vexpress-poweroff.c | 9 +++++++-- include/linux/vexpress.h | 3 --- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-vexpress/v2m.c b/arch/arm/mach-vexpress/v2m.c index 915683cb67d6..c970762e8386 100644 --- a/arch/arm/mach-vexpress/v2m.c +++ b/arch/arm/mach-vexpress/v2m.c @@ -361,8 +361,6 @@ static void __init v2m_init(void) for (i = 0; i < ARRAY_SIZE(v2m_amba_devs); i++) amba_device_register(v2m_amba_devs[i], &iomem_resource); - pm_power_off = vexpress_power_off; - ct_desc->init_tile(); } @@ -374,7 +372,6 @@ MACHINE_START(VEXPRESS, "ARM-Versatile Express") .init_irq = v2m_init_irq, .init_time = v2m_timer_init, .init_machine = v2m_init, - .restart = vexpress_restart, MACHINE_END static struct map_desc v2m_rs1_io_desc __initdata = { @@ -464,7 +461,6 @@ static void __init v2m_dt_init(void) { l2x0_of_init(0x00400000, 0xfe0fffff); of_platform_populate(NULL, v2m_dt_bus_match, NULL, NULL); - pm_power_off = vexpress_power_off; } static const char * const v2m_dt_match[] __initconst = { @@ -481,5 +477,4 @@ DT_MACHINE_START(VEXPRESS_DT, "ARM-Versatile Express") .init_irq = irqchip_init, .init_time = v2m_dt_timer_init, .init_machine = v2m_dt_init, - .restart = vexpress_restart, MACHINE_END diff --git a/drivers/power/reset/vexpress-poweroff.c b/drivers/power/reset/vexpress-poweroff.c index 465923aa3819..469e6962b2cf 100644 --- a/drivers/power/reset/vexpress-poweroff.c +++ b/drivers/power/reset/vexpress-poweroff.c @@ -18,6 +18,8 @@ #include #include +#include + static void vexpress_reset_do(struct device *dev, const char *what) { int err = -ENOENT; @@ -39,14 +41,14 @@ static void vexpress_reset_do(struct device *dev, const char *what) static struct device *vexpress_power_off_device; -void vexpress_power_off(void) +static void vexpress_power_off(void) { vexpress_reset_do(vexpress_power_off_device, "power off"); } static struct device *vexpress_restart_device; -void vexpress_restart(char str, const char *cmd) +static void vexpress_restart(char str, const char *cmd) { vexpress_reset_do(vexpress_restart_device, "restart"); } @@ -103,14 +105,17 @@ static int vexpress_reset_probe(struct platform_device *pdev) switch (func) { case FUNC_SHUTDOWN: vexpress_power_off_device = &pdev->dev; + pm_power_off = vexpress_power_off; break; case FUNC_RESET: if (!vexpress_restart_device) vexpress_restart_device = &pdev->dev; + arm_pm_restart = vexpress_restart; device_create_file(&pdev->dev, &dev_attr_active); break; case FUNC_REBOOT: vexpress_restart_device = &pdev->dev; + arm_pm_restart = vexpress_restart; device_create_file(&pdev->dev, &dev_attr_active); break; }; diff --git a/include/linux/vexpress.h b/include/linux/vexpress.h index 75818744ab59..ea7168a68081 100644 --- a/include/linux/vexpress.h +++ b/include/linux/vexpress.h @@ -115,9 +115,6 @@ unsigned __vexpress_get_site(struct device *dev, struct device_node *node); void vexpress_sysreg_early_init(void __iomem *base); void vexpress_sysreg_of_early_init(void); -void vexpress_power_off(void); -void vexpress_restart(char str, const char *cmd); - /* Clocks */ struct clk *vexpress_osc_setup(struct device *dev); -- cgit From 9b44190dc114c1720b34975b5bfc65aece112ced Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 20 Mar 2013 13:32:58 +0000 Subject: tcp: refactor F-RTO The patch series refactors the F-RTO feature (RFC4138/5682) to simplify loss recovery processing.
Existing F-RTO was developed during the experimental stage (RFC4138) and has many experimental features. It takes a separate code path from the traditional timeout processing by overloading CA_Disorder instead of using CA_Loss state. This complicates CA_Disorder state handling because it's also used for handling dubious ACKs and undos. While the algorithm in the RFC does not change the congestion control, the implementation intercepts congestion control in various places (e.g., frto_cwnd in tcp_ack()). The new code implements the newer F-RTO RFC5682 using the CA_Loss processing path. F-RTO becomes a small extension in the timeout processing and interfaces with congestion control and Eifel undo modules. It lets the congestion control (module) determine how many packets to send independently. F-RTO only chooses what to send in order to detect spurious retransmission. If the timeout is found spurious, it invokes existing Eifel undo algorithms like DSACK or TCP timestamp based detection. The first patch removes all F-RTO code; only sysctl_tcp_frto is left for the new implementation. Since CA_EVENT_FRTO is removed, TCP westwood now computes ssthresh on the regular timeout CA_EVENT_LOSS event. Signed-off-by: Yuchung Cheng Acked-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 17 -- include/linux/tcp.h | 6 +- include/net/tcp.h | 4 - net/ipv4/sysctl_net_ipv4.c | 7 - net/ipv4/tcp_input.c | 375 +--------------------------------- net/ipv4/tcp_minisocks.c | 3 - net/ipv4/tcp_output.c | 11 +- net/ipv4/tcp_timer.c | 6 +- net/ipv4/tcp_westwood.c | 2 +- 9 files changed, 10 insertions(+), 421 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 17953e2bc3e9..8a977a0aaede 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -239,23 +239,6 @@ tcp_frto - INTEGER interacts badly with the packet counting of the SACK enabled TCP flow. -tcp_frto_response - INTEGER - When F-RTO has detected that a TCP retransmission timeout was - spurious (i.e, the timeout would have been avoided had TCP set a - longer retransmission timeout), TCP has several options what to do - next. Possible values are: - 0 Rate halving based; a smooth and conservative response, - results in halved cwnd and ssthresh after one RTT - 1 Very conservative response; not recommended because even - though being valid, it interacts poorly with the rest of - Linux TCP, halves cwnd and ssthresh immediately - 2 Aggressive response; undoes congestion control measures - that are now known to be unnecessary (ignoring the - possibility of a lost retransmission that would require - TCP to be more cautious), cwnd and ssthresh are restored - to the values prior timeout - Default: 0 (rate halving based) - tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. Default: 2hours. diff --git a/include/linux/tcp.h b/include/linux/tcp.h index ed6a7456eecd..f5f203b36379 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -187,14 +187,12 @@ struct tcp_sock { u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ - u32 frto_highmark; /* snd_nxt when RTO occurred */ u16 advmss; /* Advertised MSS */ - u8 frto_counter; /* Number of new acks after RTO */ + u8 unused; u8 nonagle : 4,/* Disable Nagle algorithm?
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ - repair : 1, - unused : 1; + repair : 1; u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 7f2f17198d75..d1dcb596230e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -272,7 +272,6 @@ extern int sysctl_tcp_app_win; extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; -extern int sysctl_tcp_frto_response; extern int sysctl_tcp_low_latency; extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; @@ -424,8 +423,6 @@ extern struct sock * tcp_check_req(struct sock *sk,struct sk_buff *skb, bool fastopen); extern int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); -extern bool tcp_use_frto(struct sock *sk); -extern void tcp_enter_frto(struct sock *sk); extern void tcp_enter_loss(struct sock *sk, int how); extern void tcp_clear_retrans(struct tcp_sock *tp); extern void tcp_update_metrics(struct sock *sk); @@ -756,7 +753,6 @@ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ - CA_EVENT_FRTO, /* fast recovery timeout */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_FAST_ACK, /* in sequence ack */ CA_EVENT_SLOW_ACK, /* other ack */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index cb45062c8be0..fa2f63fc453b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -591,13 +591,6 @@ static struct ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_frto_response", - .data = &sysctl_tcp_frto_response, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_low_latency", .data = &sysctl_tcp_low_latency, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 19f0149fb6a2..231c79fe91f3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -93,7 +93,6 @@ int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; int sysctl_tcp_frto __read_mostly = 2; -int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_thin_dupack __read_mostly; @@ -108,17 +107,14 @@ int sysctl_tcp_early_retrans __read_mostly = 3; #define FLAG_DATA_SACKED 0x20 /* New SACK. 
*/ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ -#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ -#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) #define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) @@ -1159,10 +1155,6 @@ static u8 tcp_sacktag_one(struct sock *sk, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); - - /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(end_seq, tp->frto_highmark)) - state->flag |= FLAG_ONLY_ORIG_SACKED; } if (sacked & TCPCB_LOST) { @@ -1555,7 +1547,6 @@ static int tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, u32 prior_snd_una) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); const unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)->sacked); @@ -1728,12 +1719,6 @@ walk: start_seq, end_seq, dup_sack); advance_sp: - /* SACK enhanced FRTO (RFC4138, Appendix B): Clearing correct - * due to in-order walk - */ - if (after(end_seq, tp->frto_highmark)) - state.flag &= ~FLAG_ONLY_ORIG_SACKED; - i++; } @@ -1750,8 +1735,7 @@ advance_sp: tcp_verify_left_out(tp); if ((state.reord < tp->fackets_out) && - ((icsk->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker) && - (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) + ((inet_csk(sk)->icsk_ca_state != TCP_CA_Loss) || tp->undo_marker)) tcp_update_reordering(sk, tp->fackets_out - state.reord, 0); out: @@ -1825,197 +1809,6 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp) tp->sacked_out = 0; } -static int tcp_is_sackfrto(const struct tcp_sock *tp) -{ - return (sysctl_tcp_frto == 0x2) && !tcp_is_reno(tp); -} - -/* F-RTO can only be used if TCP has never retransmitted anything other than - * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) - */ -bool tcp_use_frto(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct inet_connection_sock *icsk = inet_csk(sk); - struct sk_buff *skb; - - if (!sysctl_tcp_frto) - return false; - - /* MTU probe and F-RTO won't really play nicely along currently */ - if (icsk->icsk_mtup.probe_size) - return false; - - if (tcp_is_sackfrto(tp)) - return true; - - /* Avoid expensive walking of rexmit queue if possible */ - if (tp->retrans_out > 1) - return false; - - skb = tcp_write_queue_head(sk); - if (tcp_skb_is_last(sk, skb)) - return true; - skb = tcp_write_queue_next(sk, skb); /* Skips head */ - tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - return false; - /* Short-circuit when first non-SACKed skb has been checked */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) - break; - } - return true; -} - -/* RTO occurred, but do not yet enter Loss state. 
Instead, defer RTO - * recovery a bit and use heuristics in tcp_process_frto() to detect if - * the RTO was spurious. Only clear SACKED_RETRANS of the head here to - * keep retrans_out counting accurate (with SACK F-RTO, other than head - * may still have that bit set); TCPCB_LOST and remaining SACKED_RETRANS - * bits are handled if the Loss state is really to be entered (in - * tcp_enter_frto_loss). - * - * Do like tcp_enter_loss() would; when RTO expires the second time it - * does: - * "Reduce ssthresh if it has not yet been made inside this window." - */ -void tcp_enter_frto(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if ((!tp->frto_counter && icsk->icsk_ca_state <= TCP_CA_Disorder) || - tp->snd_una == tp->high_seq || - ((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) && - !icsk->icsk_retransmits)) { - tp->prior_ssthresh = tcp_current_ssthresh(sk); - /* Our state is too optimistic in ssthresh() call because cwnd - * is not reduced until tcp_enter_frto_loss() when previous F-RTO - * recovery has not yet completed. Pattern would be this: RTO, - * Cumulative ACK, RTO (2xRTO for the same segment does not end - * up here twice). - * RFC4138 should be more specific on what to do, even though - * RTO is quite unlikely to occur after the first Cumulative ACK - * due to back-off and complexity of triggering events ... - */ - if (tp->frto_counter) { - u32 stored_cwnd; - stored_cwnd = tp->snd_cwnd; - tp->snd_cwnd = 2; - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = stored_cwnd; - } else { - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - } - /* ... in theory, cong.control module could do "any tricks" in - * ssthresh(), which means that ca_state, lost bits and lost_out - * counter would have to be faked before the call occurs. We - * consider that too expensive, unlikely and hacky, so modules - * using these in ssthresh() must deal these incompatibility - * issues if they receives CA_EVENT_FRTO and frto_counter != 0 - */ - tcp_ca_event(sk, CA_EVENT_FRTO); - } - - tp->undo_marker = tp->snd_una; - tp->undo_retrans = 0; - - skb = tcp_write_queue_head(sk); - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - } - tcp_verify_left_out(tp); - - /* Too bad if TCP was application limited */ - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp) + 1); - - /* Earlier loss recovery underway (see RFC4138; Appendix B). - * The last condition is necessary at least in tp->frto_counter case. - */ - if (tcp_is_sackfrto(tp) && (tp->frto_counter || - ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && - after(tp->high_seq, tp->snd_una)) { - tp->frto_highmark = tp->high_seq; - } else { - tp->frto_highmark = tp->snd_nxt; - } - tcp_set_ca_state(sk, TCP_CA_Disorder); - tp->high_seq = tp->snd_nxt; - tp->frto_counter = 1; -} - -/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, - * which indicates that we should follow the traditional RTO recovery, - * i.e. mark everything lost and do go-back-N retransmission. 
- */ -static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - tp->lost_out = 0; - tp->retrans_out = 0; - if (tcp_is_reno(tp)) - tcp_reset_reno_sack(tp); - - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - - TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; - /* - * Count the retransmission made on RTO correctly (only when - * waiting for the first ACK and did not get it)... - */ - if ((tp->frto_counter == 1) && !(flag & FLAG_DATA_ACKED)) { - /* For some reason this R-bit might get cleared? */ - if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out += tcp_skb_pcount(skb); - /* ...enter this if branch just for the first segment */ - flag |= FLAG_DATA_ACKED; - } else { - if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) - tp->undo_marker = 0; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - } - - /* Marking forward transmissions that were made after RTO lost - * can cause unnecessary retransmissions in some scenarios, - * SACK blocks will mitigate that in some but not in all cases. - * We used to not mark them but it was causing break-ups with - * receivers that do only in-order receival. - * - * TODO: we could detect presence of such receiver and select - * different behavior per flow. - */ - if (!(TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - tp->retransmit_high = TCP_SKB_CB(skb)->end_seq; - } - } - tcp_verify_left_out(tp); - - tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; - tp->snd_cwnd_cnt = 0; - tp->snd_cwnd_stamp = tcp_time_stamp; - tp->frto_counter = 0; - - tp->reordering = min_t(unsigned int, tp->reordering, - sysctl_tcp_reordering); - tcp_set_ca_state(sk, TCP_CA_Loss); - tp->high_seq = tp->snd_nxt; - TCP_ECN_queue_cwr(tp); - - tcp_clear_all_retrans_hints(tp); -} - static void tcp_clear_retrans_partial(struct tcp_sock *tp) { tp->retrans_out = 0; @@ -2090,8 +1883,6 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); - /* Abort F-RTO algorithm if one is in progress */ - tp->frto_counter = 0; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2275,10 +2066,6 @@ static bool tcp_time_to_recover(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; - /* Do not perform any recovery during F-RTO algorithm */ - if (tp->frto_counter) - return false; - /* Trick#1: The loss is proven. */ if (tp->lost_out) return true; @@ -2760,7 +2547,7 @@ static void tcp_try_to_open(struct sock *sk, int flag, int newly_acked_sacked) tcp_verify_left_out(tp); - if (!tp->frto_counter && !tcp_any_retrans_done(sk)) + if (!tcp_any_retrans_done(sk)) tp->retrans_stamp = 0; if (flag & FLAG_ECE) @@ -3198,8 +2985,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, flag |= FLAG_RETRANS_DATA_ACKED; ca_seq_rtt = -1; seq_rtt = -1; - if ((flag & FLAG_DATA_ACKED) || (acked_pcount > 1)) - flag |= FLAG_NONHEAD_RETRANS_ACKED; } else { ca_seq_rtt = now - scb->when; last_ackt = skb->tstamp; @@ -3408,150 +3193,6 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 return flag; } -/* A very conservative spurious RTO response algorithm: reduce cwnd and - * continue in congestion avoidance. 
- */ -static void tcp_conservative_spur_to_response(struct tcp_sock *tp) -{ - tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); - tp->snd_cwnd_cnt = 0; - TCP_ECN_queue_cwr(tp); - tcp_moderate_cwnd(tp); -} - -/* A conservative spurious RTO response algorithm: reduce cwnd using - * PRR and continue in congestion avoidance. - */ -static void tcp_cwr_spur_to_response(struct sock *sk) -{ - tcp_enter_cwr(sk, 0); -} - -static void tcp_undo_spur_to_response(struct sock *sk, int flag) -{ - if (flag & FLAG_ECE) - tcp_cwr_spur_to_response(sk); - else - tcp_undo_cwr(sk, true); -} - -/* F-RTO spurious RTO detection algorithm (RFC4138) - * - * F-RTO affects during two new ACKs following RTO (well, almost, see inline - * comments). State (ACK number) is kept in frto_counter. When ACK advances - * window (but not to or beyond highest sequence sent before RTO): - * On First ACK, send two new segments out. - * On Second ACK, RTO was likely spurious. Do spurious response (response - * algorithm is not part of the F-RTO detection algorithm - * given in RFC4138 but can be selected separately). - * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. F-RTO allows overriding - * of Nagle, this is done using frto_counter states 2 and 3, when a new data - * segment of any size sent during F-RTO, state 2 is upgraded to 3. - * - * Rationale: if the RTO was spurious, new ACKs should arrive from the - * original window even after we transmit two new data segments. - * - * SACK version: - * on first step, wait until first cumulative ACK arrives, then move to - * the second step. In second step, the next ACK decides. - * - * F-RTO is implemented (mainly) in four functions: - * - tcp_use_frto() is used to determine if TCP is can use F-RTO - * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is - * called when tcp_use_frto() showed green light - * - tcp_process_frto() handles incoming ACKs during F-RTO algorithm - * - tcp_enter_frto_loss() is called if there is not enough evidence - * to prove that the RTO is indeed spurious. It transfers the control - * from F-RTO to the conventional RTO recovery - */ -static bool tcp_process_frto(struct sock *sk, int flag) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tcp_verify_left_out(tp); - - /* Duplicate the behavior from Loss state (fastretrans_alert) */ - if (flag & FLAG_DATA_ACKED) - inet_csk(sk)->icsk_retransmits = 0; - - if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || - ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) - tp->undo_marker = 0; - - if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); - return true; - } - - if (!tcp_is_sackfrto(tp)) { - /* RFC4138 shortcoming in step 2; should also have case c): - * ACK isn't duplicate nor advances window, e.g., opposite dir - * data, winupdate - */ - if (!(flag & FLAG_ANY_PROGRESS) && (flag & FLAG_NOT_DUP)) - return true; - - if (!(flag & FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), - flag); - return true; - } - } else { - if (!(flag & FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { - if (!tcp_packets_in_flight(tp)) { - tcp_enter_frto_loss(sk, 2, flag); - return true; - } - - /* Prevent sending of new data. 
*/ - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp)); - return true; - } - - if ((tp->frto_counter >= 2) && - (!(flag & FLAG_FORWARD_PROGRESS) || - ((flag & FLAG_DATA_SACKED) && - !(flag & FLAG_ONLY_ORIG_SACKED)))) { - /* RFC4138 shortcoming (see comment above) */ - if (!(flag & FLAG_FORWARD_PROGRESS) && - (flag & FLAG_NOT_DUP)) - return true; - - tcp_enter_frto_loss(sk, 3, flag); - return true; - } - } - - if (tp->frto_counter == 1) { - /* tcp_may_send_now needs to see updated state */ - tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; - tp->frto_counter = 2; - - if (!tcp_may_send_now(sk)) - tcp_enter_frto_loss(sk, 2, flag); - - return true; - } else { - switch (sysctl_tcp_frto_response) { - case 2: - tcp_undo_spur_to_response(sk, flag); - break; - case 1: - tcp_conservative_spur_to_response(tp); - break; - default: - tcp_cwr_spur_to_response(sk); - break; - } - tp->frto_counter = 0; - tp->undo_marker = 0; - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); - } - return false; -} - /* RFC 5961 7 [ACK Throttling] */ static void tcp_send_challenge_ack(struct sock *sk) { @@ -3616,7 +3257,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) int prior_packets; int prior_sacked = tp->sacked_out; int pkts_acked = 0; - bool frto_cwnd = false; /* If the ack is older than previous acks * then we can probably ignore it. @@ -3690,22 +3330,15 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) pkts_acked = prior_packets - tp->packets_out; - if (tp->frto_counter) - frto_cwnd = tcp_process_frto(sk, flag); - /* Guarantee sacktag reordering detection against wrap-arounds */ - if (before(tp->frto_highmark, tp->snd_una)) - tp->frto_highmark = 0; - if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. 
*/ - if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && - tcp_may_raise_cwnd(sk, flag)) + if ((flag & FLAG_DATA_ACKED) && tcp_may_raise_cwnd(sk, flag)) tcp_cong_avoid(sk, ack, prior_in_flight); is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, pkts_acked, prior_sacked, is_dupack, flag); } else { - if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) + if (flag & FLAG_DATA_ACKED) tcp_cong_avoid(sk, ack, prior_in_flight); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8f0234f8bb95..05eaf8904613 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -422,9 +422,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newtp->snd_cwnd = TCP_INIT_CWND; newtp->snd_cwnd_cnt = 0; - newtp->frto_counter = 0; - newtp->frto_highmark = 0; - if (newicsk->icsk_ca_ops != &tcp_init_congestion_ops && !try_module_get(newicsk->icsk_ca_ops->owner)) newicsk->icsk_ca_ops = &tcp_init_congestion_ops; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index e787ecec505e..163cf5fc0119 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -78,10 +78,6 @@ static void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb) tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - /* Don't override Nagle indefinitely with F-RTO */ - if (tp->frto_counter == 2) - tp->frto_counter = 3; - tp->packets_out += tcp_skb_pcount(skb); if (!prior_packets || icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) @@ -1470,11 +1466,8 @@ static inline bool tcp_nagle_test(const struct tcp_sock *tp, const struct sk_buf if (nonagle & TCP_NAGLE_PUSH) return true; - /* Don't use the nagle rule for urgent data (or for the final FIN). - * Nagle can be ignored during F-RTO too (see RFC4138). - */ - if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || - (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) + /* Don't use the nagle rule for urgent data (or for the final FIN). */ + if (tcp_urg_mode(tp) || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) return true; if (!tcp_nagle_check(tp, skb, cur_mss, nonagle)) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index eeccf795e917..4b85e6f636c9 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -416,11 +416,7 @@ void tcp_retransmit_timer(struct sock *sk) NET_INC_STATS_BH(sock_net(sk), mib_idx); } - if (tcp_use_frto(sk)) { - tcp_enter_frto(sk); - } else { - tcp_enter_loss(sk, 0); - } + tcp_enter_loss(sk, 0); if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) { /* Retransmission failed because of local congestion, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 1b91bf48e277..76a1e23259e1 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -236,7 +236,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; - case CA_EVENT_FRTO: + case CA_EVENT_LOSS: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); /* Update RTT_min when next ack arrives */ w->reset_rtt_min = 1; -- cgit From e33099f96d99c391b3325caa9c44258de04aae86 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 20 Mar 2013 13:33:00 +0000 Subject: tcp: implement RFC5682 F-RTO This patch implements F-RTO (forward RTO recovery): When the first retransmission after timeout is acknowledged, F-RTO sends new data instead of old data.
If the next ACK acknowledges some never-retransmitted data, then the timeout was spurious and the congestion state is reverted. Otherwise, if the next ACK selectively acknowledges the new data, then the timeout was genuine and the loss recovery continues. This idea applies to recurring timeouts as well. While F-RTO sends different data during timeout recovery, it does not (and should not) change the congestion control. The implementation follows the three steps of the SACK enhanced algorithm (section 3) in RFC5682. Step 1 is in tcp_enter_loss(). Steps 2 and 3 are in tcp_process_loss(). The basic version is not supported because the SACK enhanced version also works for non-SACK connections. The new implementation is functionally in parity with the old F-RTO implementation except the one case where it increases undo events: In addition to the RFC algorithm, a spurious timeout may be detected without sending data in step 2, as long as the SACK confirms not all the original data are dropped. When this happens, the sender will undo the cwnd and perhaps enter fast recovery instead. This additional check increases the F-RTO undo events by 5x compared to the prior implementation on Google Web servers, since the sender often does not have new data to send for HTTP. Note that F-RTO may detect a spurious timeout before Eifel with timestamps does. Signed-off-by: Yuchung Cheng Acked-by: Eric Dumazet Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 18 +++------ include/linux/tcp.h | 3 +- net/ipv4/tcp_input.c | 73 ++++++++++++++++++++++++++++------ 3 files changed, 68 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 8a977a0aaede..f98ca633b528 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -225,19 +225,13 @@ tcp_fin_timeout - INTEGER Default: 60 seconds tcp_frto - INTEGER - Enables Forward RTO-Recovery (F-RTO) defined in RFC4138. + Enables Forward RTO-Recovery (F-RTO) defined in RFC5682. F-RTO is an enhanced recovery algorithm for TCP retransmission - timeouts. It is particularly beneficial in wireless environments - where packet loss is typically due to random radio interference - rather than intermediate router congestion. F-RTO is sender-side - only modification. Therefore it does not require any support from - the peer. - - If set to 1, basic version is enabled. 2 enables SACK enhanced - F-RTO if flow uses SACK. The basic version can be used also when - SACK is in use though scenario(s) with it exists where F-RTO - interacts badly with the packet counting of the SACK enabled TCP - flow. + timeouts. It is particularly beneficial in networks where the + RTT fluctuates (e.g., wireless). F-RTO is sender-side only + modification. It does not require any support from the peer. + + By default it's enabled with a non-zero value. 0 disables F-RTO. tcp_keepalive_time - INTEGER How often TCP sends out keepalive messages when keepalive is enabled. diff --git a/include/linux/tcp.h b/include/linux/tcp.h index f5f203b36379..5adbc33d1ab3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -192,7 +192,8 @@ struct tcp_sock { u8 nonagle : 4,/* Disable Nagle algorithm?
*/ thin_lto : 1,/* Use linear timeouts for thin streams */ thin_dupack : 1,/* Fast retransmit on first dupack */ - repair : 1; + repair : 1, + frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 do_early_retrans:1,/* Enable RFC5827 early-retransmit */ syn_data:1, /* SYN includes data */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8d821e45b917..b2b36196b342 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -107,6 +107,7 @@ int sysctl_tcp_early_retrans __read_mostly = 3; #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ @@ -1155,6 +1156,8 @@ static u8 tcp_sacktag_one(struct sock *sk, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); + if (!after(end_seq, tp->high_seq)) + state->flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_LOST) { @@ -1835,10 +1838,13 @@ void tcp_enter_loss(struct sock *sk, int how) const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; + bool new_recovery = false; /* Reduce ssthresh if it has not yet been made inside this window. */ - if (icsk->icsk_ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + if (icsk->icsk_ca_state <= TCP_CA_Disorder || + !after(tp->high_seq, tp->snd_una) || (icsk->icsk_ca_state == TCP_CA_Loss && !icsk->icsk_retransmits)) { + new_recovery = true; tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); tcp_ca_event(sk, CA_EVENT_LOSS); @@ -1883,6 +1889,14 @@ void tcp_enter_loss(struct sock *sk, int how) tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); + + /* F-RTO RFC5682 sec 3.1 step 1: retransmit SND.UNA if no previous + * loss recovery is underway except recurring timeout(s) on + * the same SND.UNA (sec 3.2). Disable F-RTO on path MTU probing + */ + tp->frto = sysctl_tcp_frto && + (new_recovery || icsk->icsk_retransmits) && + !inet_csk(sk)->icsk_mtup.probe_size; } /* If ACK arrived pointing to a remembered SACK, it means that our @@ -2426,12 +2440,12 @@ static int tcp_try_undo_partial(struct sock *sk, int acked) return failed; } -/* Undo during loss recovery after partial ACK. */ -static bool tcp_try_undo_loss(struct sock *sk) +/* Undo during loss recovery after partial ACK or using F-RTO. */ +static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_may_undo(tp)) { + if (frto_undo || tcp_may_undo(tp)) { struct sk_buff *skb; tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -2445,9 +2459,12 @@ static bool tcp_try_undo_loss(struct sock *sk) tp->lost_out = 0; tcp_undo_cwr(sk, true); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSSUNDO); + if (frto_undo) + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; - if (tcp_is_sack(tp)) + if (frto_undo || tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return true; } @@ -2667,24 +2684,52 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack) /* Process an ACK in CA_Loss state. 
Move to CA_Open if lost data are * recovered or spurious. Otherwise retransmits more on partial ACKs. */ -static void tcp_process_loss(struct sock *sk, int flag) +static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); + bool recovered = !before(tp->snd_una, tp->high_seq); - if (!before(tp->snd_una, tp->high_seq)) { + if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */ + if (flag & FLAG_ORIG_SACK_ACKED) { + /* Step 3.b. A timeout is spurious if not all data are + * lost, i.e., never-retransmitted data are (s)acked. + */ + tcp_try_undo_loss(sk, true); + return; + } + if (after(tp->snd_nxt, tp->high_seq) && + (flag & FLAG_DATA_SACKED || is_dupack)) { + tp->frto = 0; /* Loss was real: 2nd part of step 3.a */ + } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) { + tp->high_seq = tp->snd_nxt; + __tcp_push_pending_frames(sk, tcp_current_mss(sk), + TCP_NAGLE_OFF); + if (after(tp->snd_nxt, tp->high_seq)) + return; /* Step 2.b */ + tp->frto = 0; + } + } + + if (recovered) { + /* F-RTO RFC5682 sec 3.1 step 2.a and 1st part of step 3.a */ icsk->icsk_retransmits = 0; tcp_try_undo_recovery(sk); return; } - if (flag & FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; - if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED) - tcp_reset_reno_sack(tp); - if (tcp_try_undo_loss(sk)) + if (tcp_is_reno(tp)) { + /* A Reno DUPACK means new data in F-RTO step 2.b above are + * delivered. Lower inflight to clock out (re)tranmissions. + */ + if (after(tp->snd_nxt, tp->high_seq) && is_dupack) + tcp_add_reno_sack(sk); + else if (flag & FLAG_SND_UNA_ADVANCED) + tcp_reset_reno_sack(tp); + } + if (tcp_try_undo_loss(sk, false)) return; - tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); } @@ -2764,7 +2809,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, newly_acked_sacked = pkts_acked + tp->sacked_out - prior_sacked; break; case TCP_CA_Loss: - tcp_process_loss(sk, flag); + tcp_process_loss(sk, flag, is_dupack); if (icsk->icsk_ca_state != TCP_CA_Open) return; /* Fall through to processing in Open state. */ @@ -3003,6 +3048,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, } if (!(sacked & TCPCB_SACKED_ACKED)) reord = min(pkts_acked, reord); + if (!after(scb->end_seq, tp->high_seq)) + flag |= FLAG_ORIG_SACK_ACKED; } if (sacked & TCPCB_SACKED_ACKED) -- cgit From 3cf956eebe54cdb7cf1701642085507f0354e56a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 20 Mar 2013 10:12:10 +0100 Subject: ASoC: wm8994: Support constraining the maximum number of channels clocked Some systems use the audio CODEC to clock a DAI with multiple data lines in parallel, meaning that bit clocks are only required for a smaller number of channels than data is sent for. In some cases providing the extra bit clocks can take the other devices on the audio bus out of spec. Support such systems by allowing a maximum number of channels to be specified. 
Signed-off-by: Mark Brown --- include/linux/mfd/wm8994/pdata.h | 8 ++++++++ sound/soc/codecs/wm8994.c | 13 +++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/wm8994/pdata.h b/include/linux/mfd/wm8994/pdata.h index 8e21a094836d..68e776594889 100644 --- a/include/linux/mfd/wm8994/pdata.h +++ b/include/linux/mfd/wm8994/pdata.h @@ -17,6 +17,7 @@ #define WM8994_NUM_LDO 2 #define WM8994_NUM_GPIO 11 +#define WM8994_NUM_AIF 3 struct wm8994_ldo_pdata { /** GPIOs to enable regulator, 0 or less if not available */ @@ -215,6 +216,13 @@ struct wm8994_pdata { * system. */ bool spkmode_pu; + + /** + * Maximum number of channels clocks will be generated for, + * useful for systems where and I2S bus with multiple data + * lines is mastered. + */ + int max_channels_clocked[WM8994_NUM_AIF]; }; #endif diff --git a/sound/soc/codecs/wm8994.c b/sound/soc/codecs/wm8994.c index c9bd445c4976..318ea64b9800 100644 --- a/sound/soc/codecs/wm8994.c +++ b/sound/soc/codecs/wm8994.c @@ -2656,6 +2656,8 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, { struct snd_soc_codec *codec = dai->codec; struct wm8994_priv *wm8994 = snd_soc_codec_get_drvdata(codec); + struct wm8994 *control = wm8994->wm8994; + struct wm8994_pdata *pdata = &control->pdata; int aif1_reg; int aif2_reg; int bclk_reg; @@ -2723,7 +2725,14 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, } wm8994->channels[id] = params_channels(params); - switch (params_channels(params)) { + if (pdata->max_channels_clocked[id] && + wm8994->channels[id] > pdata->max_channels_clocked[id]) { + dev_dbg(dai->dev, "Constraining channels to %d from %d\n", + pdata->max_channels_clocked[id], wm8994->channels[id]); + wm8994->channels[id] = pdata->max_channels_clocked[id]; + } + + switch (wm8994->channels[id]) { case 1: case 2: bclk_rate *= 2; @@ -2745,7 +2754,7 @@ static int wm8994_hw_params(struct snd_pcm_substream *substream, dev_dbg(dai->dev, "AIF%dCLK is %dHz, target BCLK %dHz\n", dai->id, wm8994->aifclk[id], bclk_rate); - if (params_channels(params) == 1 && + if (wm8994->channels[id] == 1 && (snd_soc_read(codec, aif1_reg) & 0x18) == 0x18) aif2 |= WM8994_AIF1_MONO; -- cgit From 4f1b07581613bf076b0dacdd9a3fb290d3caa227 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Thu, 10 Jan 2013 15:38:48 +0000 Subject: mfd: wm5102: Add additional speaker control registers Signed-off-by: Mark Brown --- drivers/mfd/wm5102-tables.c | 2 ++ include/linux/mfd/arizona/registers.h | 2 ++ 2 files changed, 4 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/wm5102-tables.c b/drivers/mfd/wm5102-tables.c index 8a6ce8c12505..7d01069c09db 100644 --- a/drivers/mfd/wm5102-tables.c +++ b/drivers/mfd/wm5102-tables.c @@ -1169,6 +1169,8 @@ static bool wm5102_readable_register(struct device *dev, unsigned int reg) case ARIZONA_NOISE_GATE_CONTROL: case ARIZONA_PDM_SPK1_CTRL_1: case ARIZONA_PDM_SPK1_CTRL_2: + case ARIZONA_SPK_CTRL_2: + case ARIZONA_SPK_CTRL_3: case ARIZONA_DAC_COMP_1: case ARIZONA_DAC_COMP_2: case ARIZONA_DAC_COMP_3: diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h index a61ce90ecd3f..a47fd358016f 100644 --- a/include/linux/mfd/arizona/registers.h +++ b/include/linux/mfd/arizona/registers.h @@ -217,6 +217,8 @@ #define ARIZONA_PDM_SPK1_CTRL_2 0x491 #define ARIZONA_PDM_SPK2_CTRL_1 0x492 #define ARIZONA_PDM_SPK2_CTRL_2 0x493 +#define ARIZONA_SPK_CTRL_2 0x4B5 +#define ARIZONA_SPK_CTRL_3 0x4B6 #define ARIZONA_DAC_COMP_1 0x4DC #define 
ARIZONA_DAC_COMP_2 0x4DD #define ARIZONA_DAC_COMP_3 0x4DE -- cgit From 79617801ea0c0e6664cb497d4c1892c2ff407364 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 21 Mar 2013 22:22:03 +0100 Subject: filter: bpf_jit_comp: refactor and unify BPF JIT image dump output If bpf_jit_enable > 1, then we dump the emitted JIT compiled image after creation. Currently, only SPARC and PowerPC have output similar to the reference implementation on x86_64. Make a small helper function in order to reduce duplicated code and make the dump output uniform across architectures x86_64, SPARC, PPC, ARM (e.g. on ARM flen, pass and proglen are currently not shown, but would be interesting to know as well), and for future BPF JIT implementations on other archs. Cc: Mircea Gherzan Cc: Matt Evans Cc: Eric Dumazet Cc: David S. Miller Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/arm/net/bpf_jit_32.c | 5 ++--- arch/powerpc/net/bpf_jit_comp.c | 12 ++++-------- arch/sparc/net/bpf_jit_comp.c | 6 +----- arch/x86/net/bpf_jit_comp.c | 9 ++------- include/linux/filter.h | 10 ++++++++++ 5 files changed, 19 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index a0bd8a755bdf..1a643ee8e082 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -918,9 +918,8 @@ void bpf_jit_compile(struct sk_filter *fp) #endif if (bpf_jit_enable > 1) - print_hex_dump(KERN_INFO, "BPF JIT code: ", - DUMP_PREFIX_ADDRESS, 16, 4, ctx.target, - alloc_size, false); + /* there are 2 passes here */ + bpf_jit_dump(fp->len, alloc_size, 2, ctx.target); fp->bpf_func = (void *)ctx.target; out: diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index e834f1ec23c8..c427ae36374a 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -671,16 +671,12 @@ void bpf_jit_compile(struct sk_filter *fp) } if (bpf_jit_enable > 1) - pr_info("flen=%d proglen=%u pass=%d image=%p\n", - flen, proglen, pass, image); + /* Note that we output the base address of the code_base * rather than image, since opcodes are in code_base. 
+ */ + bpf_jit_dump(flen, proglen, pass, code_base); if (image) { - if (bpf_jit_enable > 1) - print_hex_dump(KERN_ERR, "JIT code: ", - DUMP_PREFIX_ADDRESS, - 16, 1, code_base, - proglen, false); - bpf_flush_icache(code_base, code_base + (proglen/4)); /* Function descriptor nastiness: Address + TOC */ ((u64 *)image)[0] = (u64)code_base; diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index 3109ca684a99..d36a85ebb5e0 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -795,13 +795,9 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; } if (bpf_jit_enable > 1) - pr_err("flen=%d proglen=%u pass=%d image=%p\n", - flen, proglen, pass, image); + bpf_jit_dump(flen, proglen, pass, image); if (image) { - if (bpf_jit_enable > 1) - print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS, - 16, 1, image, proglen, false); bpf_flush_icache(image, image + proglen); fp->bpf_func = (void *)image; } diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 3cbe45381bbb..f66b54086ce5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -725,17 +725,12 @@ cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; } oldproglen = proglen; } + if (bpf_jit_enable > 1) - pr_err("flen=%d proglen=%u pass=%d image=%p\n", - flen, proglen, pass, image); + bpf_jit_dump(flen, proglen, pass, image); if (image) { - if (bpf_jit_enable > 1) - print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS, - 16, 1, image, proglen, false); - bpf_flush_icache(image, image + proglen); - fp->bpf_func = (void *)image; } out: diff --git a/include/linux/filter.h b/include/linux/filter.h index d2059cb4e465..d7d25083130b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -50,6 +50,16 @@ extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, uns #ifdef CONFIG_BPF_JIT extern void bpf_jit_compile(struct sk_filter *fp); extern void bpf_jit_free(struct sk_filter *fp); + +static inline void bpf_jit_dump(unsigned int flen, unsigned int proglen, + u32 pass, void *image) +{ + pr_err("flen=%u proglen=%u pass=%u image=%p\n", + flen, proglen, pass, image); + if (image) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS, + 16, 1, image, proglen, false); +} #define SK_RUN_FILTER(FILTER, SKB) (*FILTER->bpf_func)(SKB, FILTER->insns) #else static inline void bpf_jit_compile(struct sk_filter *fp) -- cgit From f58b082aed43400c03e53beacc50a9f9eb23ac91 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 6 Mar 2013 23:46:20 +0100 Subject: ACPI / scan: Add special handler for Intel Lynxpoint LPSS devices Devices on the Intel Lynxpoint Low Power Subsystem (LPSS) have some common features that aren't shared with any other platform devices, including the clock and LTR (Latency Tolerance Reporting) registers. It is better to handle those features in common code than to bother device drivers with doing that (I/O functionality-wise the LPSS devices are generally compatible with other devices that don't have those special registers and may be handled by the same drivers). The clock registers of the LPSS devices are now taken care of by the special clk-x86-lpss driver, but the MMIO mappings used for accessing those registers can also be used for accessing the LTR registers on those devices (LTR support for the Lynxpoint LPSS is going to be added by a subsequent patch). 
Thus it is convenient to add a special ACPI scan handler for the Lynxpoint LPSS devices that will create the MMIO mappings for accessing the clock (and LTR in the future) registers and will register the LPSS devices' clocks, so the clk-x86-lpss driver will only need to take care of the main Lynxpoint LPSS clock. Introduce a special ACPI scan handler for Intel Lynxpoint LPSS devices as described above. This also reduces overhead related to browsing the ACPI namespace in search of the LPSS devices before the registration of their clocks, removes some LPSS-specific (and somewhat ugly) code from acpi_platform.c and shrinks the overall code size slightly. Signed-off-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki Acked-by: Mike Turquette --- drivers/acpi/Makefile | 1 + drivers/acpi/acpi_lpss.c | 163 +++++++++++++++++++++++++++++++++ drivers/acpi/acpi_platform.c | 40 +------- drivers/acpi/internal.h | 8 ++ drivers/acpi/scan.c | 1 + drivers/clk/x86/Makefile | 2 +- drivers/clk/x86/clk-lpss.c | 99 -------------------- drivers/clk/x86/clk-lpss.h | 36 -------- drivers/clk/x86/clk-lpt.c | 40 +------- include/linux/platform_data/clk-lpss.h | 18 ++++ 10 files changed, 195 insertions(+), 213 deletions(-) create mode 100644 drivers/acpi/acpi_lpss.c delete mode 100644 drivers/clk/x86/clk-lpss.c delete mode 100644 drivers/clk/x86/clk-lpss.h create mode 100644 include/linux/platform_data/clk-lpss.h (limited to 'include/linux') diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 474fcfeba66c..ecb743bf05a5 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -39,6 +39,7 @@ acpi-y += ec.o acpi-$(CONFIG_ACPI_DOCK) += dock.o acpi-y += pci_root.o pci_link.o pci_irq.o acpi-y += csrt.o +acpi-$(CONFIG_X86_INTEL_LPSS) += acpi_lpss.o acpi-y += acpi_platform.o acpi-y += power.o acpi-y += event.o diff --git a/drivers/acpi/acpi_lpss.c b/drivers/acpi/acpi_lpss.c new file mode 100644 index 000000000000..823df46a3deb --- /dev/null +++ b/drivers/acpi/acpi_lpss.c @@ -0,0 +1,163 @@ +/* + * ACPI support for Intel Lynxpoint LPSS. + * + * Copyright (C) 2013, Intel Corporation + * Authors: Mika Westerberg + * Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +ACPI_MODULE_NAME("acpi_lpss"); + +#define LPSS_CLK_OFFSET 0x800 +#define LPSS_CLK_SIZE 0x04 + +struct lpss_device_desc { + bool clk_required; + const char *clk_parent; +}; + +struct lpss_private_data { + void __iomem *mmio_base; + resource_size_t mmio_size; + struct clk *clk; + const struct lpss_device_desc *dev_desc; +}; + +static struct lpss_device_desc lpt_dev_desc = { + .clk_required = true, + .clk_parent = "lpss_clk", +}; + +static const struct acpi_device_id acpi_lpss_device_ids[] = { + /* Lynxpoint LPSS devices */ + { "INT33C0", (unsigned long)&lpt_dev_desc }, + { "INT33C1", (unsigned long)&lpt_dev_desc }, + { "INT33C2", (unsigned long)&lpt_dev_desc }, + { "INT33C3", (unsigned long)&lpt_dev_desc }, + { "INT33C4", (unsigned long)&lpt_dev_desc }, + { "INT33C5", (unsigned long)&lpt_dev_desc }, + { "INT33C6", }, + { "INT33C7", }, + + { } +}; + +static int is_memory(struct acpi_resource *res, void *not_used) +{ + struct resource r; + return !acpi_dev_resource_memory(res, &r); +} + +/* LPSS main clock device. 
*/ +static struct platform_device *lpss_clk_dev; + +static inline void lpt_register_clock_device(void) +{ + lpss_clk_dev = platform_device_register_simple("clk-lpt", -1, NULL, 0); +} + +static int register_device_clock(struct acpi_device *adev, + struct lpss_private_data *pdata) +{ + const struct lpss_device_desc *dev_desc = pdata->dev_desc; + + if (!lpss_clk_dev) + lpt_register_clock_device(); + + if (!dev_desc->clk_parent || !pdata->mmio_base + || pdata->mmio_size < LPSS_CLK_OFFSET + LPSS_CLK_SIZE) + return -ENODATA; + + pdata->clk = clk_register_gate(NULL, dev_name(&adev->dev), + dev_desc->clk_parent, 0, + pdata->mmio_base + LPSS_CLK_OFFSET, + 0, 0, NULL); + if (IS_ERR(pdata->clk)) + return PTR_ERR(pdata->clk); + + clk_register_clkdev(pdata->clk, NULL, dev_name(&adev->dev)); + return 0; +} + +static int acpi_lpss_create_device(struct acpi_device *adev, + const struct acpi_device_id *id) +{ + struct lpss_device_desc *dev_desc; + struct lpss_private_data *pdata; + struct resource_list_entry *rentry; + struct list_head resource_list; + int ret; + + dev_desc = (struct lpss_device_desc *)id->driver_data; + if (!dev_desc) + return acpi_create_platform_device(adev, id); + + pdata = kzalloc(sizeof(*pdata), GFP_KERNEL); + if (!pdata) + return -ENOMEM; + + INIT_LIST_HEAD(&resource_list); + ret = acpi_dev_get_resources(adev, &resource_list, is_memory, NULL); + if (ret < 0) + goto err_out; + + list_for_each_entry(rentry, &resource_list, node) + if (resource_type(&rentry->res) == IORESOURCE_MEM) { + pdata->mmio_size = resource_size(&rentry->res); + pdata->mmio_base = ioremap(rentry->res.start, + pdata->mmio_size); + pdata->dev_desc = dev_desc; + break; + } + + acpi_dev_free_resource_list(&resource_list); + + if (dev_desc->clk_required) { + ret = register_device_clock(adev, pdata); + if (ret) { + /* + * Skip the device, but don't terminate the namespace + * scan. + */ + ret = 0; + goto err_out; + } + } + + adev->driver_data = pdata; + ret = acpi_create_platform_device(adev, id); + if (ret > 0) + return ret; + + adev->driver_data = NULL; + + err_out: + kfree(pdata); + return ret; +} + +static struct acpi_scan_handler lpss_handler = { + .ids = acpi_lpss_device_ids, + .attach = acpi_lpss_create_device, +}; + +void __init acpi_lpss_init(void) +{ + if (!lpt_clk_init()) + acpi_scan_add_handler(&lpss_handler); +} diff --git a/drivers/acpi/acpi_platform.c b/drivers/acpi/acpi_platform.c index 26fce4b8a632..fafec5ddf17f 100644 --- a/drivers/acpi/acpi_platform.c +++ b/drivers/acpi/acpi_platform.c @@ -22,9 +22,6 @@ ACPI_MODULE_NAME("platform"); -/* Flags for acpi_create_platform_device */ -#define ACPI_PLATFORM_CLK BIT(0) - /* * The following ACPI IDs are known to be suitable for representing as * platform devices. 
@@ -33,33 +30,9 @@ static const struct acpi_device_id acpi_platform_device_ids[] = { { "PNP0D40" }, - /* Haswell LPSS devices */ - { "INT33C0", ACPI_PLATFORM_CLK }, - { "INT33C1", ACPI_PLATFORM_CLK }, - { "INT33C2", ACPI_PLATFORM_CLK }, - { "INT33C3", ACPI_PLATFORM_CLK }, - { "INT33C4", ACPI_PLATFORM_CLK }, - { "INT33C5", ACPI_PLATFORM_CLK }, - { "INT33C6", ACPI_PLATFORM_CLK }, - { "INT33C7", ACPI_PLATFORM_CLK }, - { } }; -static int acpi_create_platform_clks(struct acpi_device *adev) -{ - static struct platform_device *pdev; - - /* Create Lynxpoint LPSS clocks */ - if (!pdev && !strncmp(acpi_device_hid(adev), "INT33C", 6)) { - pdev = platform_device_register_simple("clk-lpt", -1, NULL, 0); - if (IS_ERR(pdev)) - return PTR_ERR(pdev); - } - - return 0; -} - /** * acpi_create_platform_device - Create platform device for ACPI device node * @adev: ACPI device node to create a platform device for. @@ -71,10 +44,9 @@ static int acpi_create_platform_clks(struct acpi_device *adev) * * Name of the platform device will be the same as @adev's. */ -static int acpi_create_platform_device(struct acpi_device *adev, - const struct acpi_device_id *id) +int acpi_create_platform_device(struct acpi_device *adev, + const struct acpi_device_id *id) { - unsigned long flags = id->driver_data; struct platform_device *pdev = NULL; struct acpi_device *acpi_parent; struct platform_device_info pdevinfo; @@ -83,14 +55,6 @@ static int acpi_create_platform_device(struct acpi_device *adev, struct resource *resources; int count; - if (flags & ACPI_PLATFORM_CLK) { - int ret = acpi_create_platform_clks(adev); - if (ret) { - dev_err(&adev->dev, "failed to create clocks\n"); - return ret; - } - } - /* If the ACPI node already has a physical device attached, skip it. */ if (adev->physical_node_count) return 0; diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index 3c94a732b4b3..e227819217fb 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -48,6 +48,11 @@ int acpi_debugfs_init(void); #else static inline void acpi_debugfs_init(void) { return; } #endif +#ifdef CONFIG_X86_INTEL_LPSS +void acpi_lpss_init(void); +#else +static inline void acpi_lpss_init(void) {} +#endif /* -------------------------------------------------------------------------- Device Node Initialization / Removal @@ -131,4 +136,7 @@ static inline void suspend_nvs_restore(void) {} -------------------------------------------------------------------------- */ struct platform_device; +int acpi_create_platform_device(struct acpi_device *adev, + const struct acpi_device_id *id); + #endif /* _ACPI_INTERNAL_H_ */ diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 5e7e991717d7..433a4e15019c 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1788,6 +1788,7 @@ int __init acpi_scan_init(void) acpi_pci_root_init(); acpi_pci_link_init(); acpi_platform_init(); + acpi_lpss_init(); acpi_csrt_init(); acpi_container_init(); acpi_pci_slot_init(); diff --git a/drivers/clk/x86/Makefile b/drivers/clk/x86/Makefile index f9ba4fab0ddc..04781389d0fb 100644 --- a/drivers/clk/x86/Makefile +++ b/drivers/clk/x86/Makefile @@ -1,2 +1,2 @@ -clk-x86-lpss-objs := clk-lpss.o clk-lpt.o +clk-x86-lpss-objs := clk-lpt.o obj-$(CONFIG_X86_INTEL_LPSS) += clk-x86-lpss.o diff --git a/drivers/clk/x86/clk-lpss.c b/drivers/clk/x86/clk-lpss.c deleted file mode 100644 index b5e229f3c3d9..000000000000 --- a/drivers/clk/x86/clk-lpss.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Intel Low Power Subsystem clocks. 
- * - * Copyright (C) 2013, Intel Corporation - * Authors: Mika Westerberg - * Heikki Krogerus - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include - -static int clk_lpss_is_mmio_resource(struct acpi_resource *res, void *data) -{ - struct resource r; - return !acpi_dev_resource_memory(res, &r); -} - -static acpi_status clk_lpss_find_mmio(acpi_handle handle, u32 level, - void *data, void **retval) -{ - struct resource_list_entry *rentry; - struct list_head resource_list; - struct acpi_device *adev; - const char *uid = data; - int ret; - - if (acpi_bus_get_device(handle, &adev)) - return AE_OK; - - if (uid) { - if (!adev->pnp.unique_id) - return AE_OK; - if (strcmp(uid, adev->pnp.unique_id)) - return AE_OK; - } - - INIT_LIST_HEAD(&resource_list); - ret = acpi_dev_get_resources(adev, &resource_list, - clk_lpss_is_mmio_resource, NULL); - if (ret < 0) - return AE_NO_MEMORY; - - list_for_each_entry(rentry, &resource_list, node) - if (resource_type(&rentry->res) == IORESOURCE_MEM) { - *(struct resource *)retval = rentry->res; - break; - } - - acpi_dev_free_resource_list(&resource_list); - return AE_OK; -} - -/** - * clk_register_lpss_gate - register LPSS clock gate - * @name: name of this clock gate - * @parent_name: parent clock name - * @hid: ACPI _HID of the device - * @uid: ACPI _UID of the device (optional) - * @offset: LPSS PRV_CLOCK_PARAMS offset - * - * Creates and registers LPSS clock gate. - */ -struct clk *clk_register_lpss_gate(const char *name, const char *parent_name, - const char *hid, const char *uid, - unsigned offset) -{ - struct resource res = { }; - void __iomem *mmio_base; - acpi_status status; - struct clk *clk; - - /* - * First try to look the device and its mmio resource from the - * ACPI namespace. - */ - status = acpi_get_devices(hid, clk_lpss_find_mmio, (void *)uid, - (void **)&res); - if (ACPI_FAILURE(status) || !res.start) - return ERR_PTR(-ENODEV); - - mmio_base = ioremap(res.start, resource_size(&res)); - if (!mmio_base) - return ERR_PTR(-ENOMEM); - - clk = clk_register_gate(NULL, name, parent_name, 0, mmio_base + offset, - 0, 0, NULL); - if (IS_ERR(clk)) - iounmap(mmio_base); - - return clk; -} diff --git a/drivers/clk/x86/clk-lpss.h b/drivers/clk/x86/clk-lpss.h deleted file mode 100644 index e9460f442297..000000000000 --- a/drivers/clk/x86/clk-lpss.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Intel Low Power Subsystem clock. - * - * Copyright (C) 2013, Intel Corporation - * Authors: Mika Westerberg - * Heikki Krogerus - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
- */ - -#ifndef __CLK_LPSS_H -#define __CLK_LPSS_H - -#include -#include -#include - -#ifdef CONFIG_ACPI -extern struct clk *clk_register_lpss_gate(const char *name, - const char *parent_name, - const char *hid, const char *uid, - unsigned offset); -#else -static inline struct clk *clk_register_lpss_gate(const char *name, - const char *parent_name, - const char *hid, - const char *uid, - unsigned offset) -{ - return ERR_PTR(-ENODEV); -} -#endif - -#endif /* __CLK_LPSS_H */ diff --git a/drivers/clk/x86/clk-lpt.c b/drivers/clk/x86/clk-lpt.c index 81298aeef7e3..5cf4f4686406 100644 --- a/drivers/clk/x86/clk-lpt.c +++ b/drivers/clk/x86/clk-lpt.c @@ -10,7 +10,6 @@ * published by the Free Software Foundation. */ -#include #include #include #include @@ -18,8 +17,6 @@ #include #include -#include "clk-lpss.h" - #define PRV_CLOCK_PARAMS 0x800 static int lpt_clk_probe(struct platform_device *pdev) @@ -34,40 +31,6 @@ static int lpt_clk_probe(struct platform_device *pdev) /* Shared DMA clock */ clk_register_clkdev(clk, "hclk", "INTL9C60.0.auto"); - - /* SPI clocks */ - clk = clk_register_lpss_gate("spi0_clk", "lpss_clk", "INT33C0", NULL, - PRV_CLOCK_PARAMS); - if (!IS_ERR(clk)) - clk_register_clkdev(clk, NULL, "INT33C0:00"); - - clk = clk_register_lpss_gate("spi1_clk", "lpss_clk", "INT33C1", NULL, - PRV_CLOCK_PARAMS); - if (!IS_ERR(clk)) - clk_register_clkdev(clk, NULL, "INT33C1:00"); - - /* I2C clocks */ - clk = clk_register_lpss_gate("i2c0_clk", "lpss_clk", "INT33C2", NULL, - PRV_CLOCK_PARAMS); - if (!IS_ERR(clk)) - clk_register_clkdev(clk, NULL, "INT33C2:00"); - - clk = clk_register_lpss_gate("i2c1_clk", "lpss_clk", "INT33C3", NULL, - PRV_CLOCK_PARAMS); - if (!IS_ERR(clk)) - clk_register_clkdev(clk, NULL, "INT33C3:00"); - - /* UART clocks */ - clk = clk_register_lpss_gate("uart0_clk", "lpss_clk", "INT33C4", NULL, - PRV_CLOCK_PARAMS); - if (!IS_ERR(clk)) - clk_register_clkdev(clk, NULL, "INT33C4:00"); - - clk = clk_register_lpss_gate("uart1_clk", "lpss_clk", "INT33C5", NULL, - PRV_CLOCK_PARAMS); - if (!IS_ERR(clk)) - clk_register_clkdev(clk, NULL, "INT33C5:00"); - return 0; } @@ -79,8 +42,7 @@ static struct platform_driver lpt_clk_driver = { .probe = lpt_clk_probe, }; -static int __init lpt_clk_init(void) +int __init lpt_clk_init(void) { return platform_driver_register(&lpt_clk_driver); } -arch_initcall(lpt_clk_init); diff --git a/include/linux/platform_data/clk-lpss.h b/include/linux/platform_data/clk-lpss.h new file mode 100644 index 000000000000..528e73ce46d2 --- /dev/null +++ b/include/linux/platform_data/clk-lpss.h @@ -0,0 +1,18 @@ +/* + * Intel Low Power Subsystem clocks. + * + * Copyright (C) 2013, Intel Corporation + * Authors: Mika Westerberg + * Rafael J. Wysocki + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef __CLK_LPSS_H +#define __CLK_LPSS_H + +extern int lpt_clk_init(void); + +#endif /* __CLK_LPSS_H */ -- cgit From 19919226c3f20e6bf5de3df96432ce80ffd63ff2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 22 Mar 2013 10:48:33 +0100 Subject: clockevents: Add missing tick_check_broadcast_expired() for CLOCKEVENTS=n Feng's build robot reports: arch/arm/kernel/process.c: In function 'cpu_idle': arch/arm/kernel/process.c:211:4: error: implicit declaration of function 'tick_check_broadcast_expired' [-Werror=implicit-function-declaration] Add the missing inline function for non-clockevent builds. Reported-by: Wu Fengguang Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 646aac136eed..464e229e7d84 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -193,6 +193,7 @@ static inline void clockevents_suspend(void) {} static inline void clockevents_resume(void) {} #define clockevents_notify(reason, arg) do { } while (0) +static inline int tick_check_broadcast_expired(void) { return 0; } #endif -- cgit From 19dde0bd71e3dffb03ddc509019e22250f4e20c0 Mon Sep 17 00:00:00 2001 From: Janusz Dziedzic Date: Thu, 21 Mar 2013 15:47:54 +0100 Subject: cfg80211: add P2P Notice of Absence attribute Add P2P Notice of Absence attribute structure. Signed-off-by: Janusz Dziedzic Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 4cf0c9e4dd99..d10b5bba3268 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1027,6 +1027,26 @@ enum ieee80211_p2p_attr_id { IEEE80211_P2P_ATTR_MAX }; +/* Notice of Absence attribute - described in P2P spec 4.1.14 */ +/* Typical max value used here */ +#define IEEE80211_P2P_NOA_DESC_MAX 4 + +struct ieee80211_p2p_noa_desc { + u8 count; + __le32 duration; + __le32 interval; + __le32 start_time; +} __packed; + +struct ieee80211_p2p_noa_attr { + u8 index; + u8 oppps_ctwindow; + struct ieee80211_p2p_noa_desc desc[IEEE80211_P2P_NOA_DESC_MAX]; +} __packed; + +#define IEEE80211_P2P_OPPPS_ENABLE_BIT BIT(7) +#define IEEE80211_P2P_OPPPS_CTWINDOW_MASK 0x7F + /** * struct ieee80211_bar - HT Block Ack Request * -- cgit From d79df329d0bd425c00856915b7b12f54dd100154 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 21 Mar 2013 15:58:58 +0000 Subject: regulator: ab8500: Further populate initialisation registers This patch supplies access to some extra settings provided by the AB8500 regulator device. We also update some of the existing initialisation values in accordance with internal ST-Ericsson code submissions. This single patch was originally a collection of updates which have been squashed together to aid with clarity. 
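Each REG_INIT() entry in the driver table below records the register's bank, address and the set of bits that initialisation is allowed to touch. A minimal sketch of the masked update this implies, assuming the usual read-modify-write semantics of abx500_mask_and_set_register_interruptible() (which performs the actual write):

	/* Sketch only: bits outside 'mask' keep their current value,
	 * bits inside 'mask' are taken from 'value'. */
	static unsigned char masked_init(unsigned char cur, unsigned char mask,
					 unsigned char value)
	{
		return (cur & ~mask) | (value & mask);
	}

So, for example, widening the VRF1VAUX3REGU mask from 0x03 to 0x0f in the hunk below lets initialisation data set the new Vrf1Regu bits (0x0c) as well as Vaux3Regu (0x03).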
Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500-regulators.c | 40 +++++++++++++++++++++++++-- drivers/regulator/ab8500.c | 39 ++++++++++++++++++++++++-- include/linux/regulator/ab8500.h | 8 ++++++ 3 files changed, 81 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.c b/arch/arm/mach-ux500/board-mop500-regulators.c index 2a17bc506cff..4b3c51905071 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.c +++ b/arch/arm/mach-ux500/board-mop500-regulators.c @@ -126,6 +126,7 @@ struct ab8500_regulator_reg_init ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { /* * VanaRequestCtrl = HP/LP depending on VxRequest + * VpllRequestCtrl = HP/LP depending on VxRequest * VextSupply1RequestCtrl = HP/LP depending on VxRequest */ INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL2, 0x00), @@ -142,12 +143,16 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { */ INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL4, 0x00), /* + * Vsmps1SysClkReq1HPValid = enabled + * Vsmps2SysClkReq1HPValid = enabled + * Vsmps3SysClkReq1HPValid = enabled * VanaSysClkReq1HPValid = disabled + * VpllSysClkReq1HPValid = enabled * Vaux1SysClkReq1HPValid = disabled * Vaux2SysClkReq1HPValid = disabled * Vaux3SysClkReq1HPValid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0x17), /* * VextSupply1SysClkReq1HPValid = disabled * VextSupply2SysClkReq1HPValid = disabled @@ -233,6 +238,34 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * Vamic2_dzout = high-Z when Vamic2 is disabled */ INIT_REGULATOR_REGISTER(AB8500_REGUCTRL1VAMIC, 0x00), + /* + * VBBN = force OFF + * VBBP = force OFF + * NOTE! PRCMU register + */ + INIT_REGULATOR_REGISTER(AB8500_ARMREGU2, 0x00), + /* + * VBBNSel1 = VBBP = VBBPFB + * VBBPSel1 = 0 V + * NOTE! PRCMU register + */ + INIT_REGULATOR_REGISTER(AB8500_VBBSEL1, 0x00), + /* + * VBBNSel2 = VBBP = VBBPFB + * VBBPSel2 = 0 V + * NOTE! 
PRCMU register + */ + INIT_REGULATOR_REGISTER(AB8500_VBBSEL2, 0x00), + /* + * Vsmps1Regu = HW control + * Vsmps1SelCtrl = Vsmps1 voltage defined by Vsmsp1Sel2 + */ + INIT_REGULATOR_REGISTER(AB8500_VSMPS1REGU, 0x06), + /* + * Vsmps2Regu = HW control + * Vsmps2SelCtrl = Vsmps2 voltage defined by Vsmsp2Sel2 + */ + INIT_REGULATOR_REGISTER(AB8500_VSMPS2REGU, 0x06), /* * VPll = Hw controlled * VanaRegu = force off @@ -257,9 +290,10 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { */ INIT_REGULATOR_REGISTER(AB8500_VAUX12REGU, 0x01), /* - * Vaux3regu = force off + * Vrf1Regu = HW control + * Vaux3Regu = force off */ - INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU, 0x00), + INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU, 0x08), /* * Vsmps1 = 1.15V */ diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 09014f38a948..4d9d556a47cc 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -477,7 +477,7 @@ struct ab8500_reg_init { static struct ab8500_reg_init ab8500_reg_init[] = { /* * 0x30, VanaRequestCtrl - * 0x0C, VpllRequestCtrl + * 0x0c, VpllRequestCtrl * 0xc0, VextSupply1RequestCtrl */ REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xfc), @@ -494,12 +494,16 @@ static struct ab8500_reg_init ab8500_reg_init[] = { */ REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), /* + * 0x01, Vsmps1SysClkReq1HPValid + * 0x02, Vsmps2SysClkReq1HPValid + * 0x04, Vsmps3SysClkReq1HPValid * 0x08, VanaSysClkReq1HPValid + * 0x10, VpllSysClkReq1HPValid * 0x20, Vaux1SysClkReq1HPValid * 0x40, Vaux2SysClkReq1HPValid * 0x80, Vaux3SysClkReq1HPValid */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), /* * 0x10, VextSupply1SysClkReq1HPValid * 0x20, VextSupply2SysClkReq1HPValid @@ -577,6 +581,34 @@ static struct ab8500_reg_init ab8500_reg_init[] = { * 0x02, Vamic2_dzout */ REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), + /* + * 0x0c, VBBNRegu + * 0x03, VBBPRegu + * NOTE! PRCMU register + */ + REG_INIT(AB8500_ARMREGU2, 0x04, 0x01, 0x0f), + /* + * 0x0c, VBBPSel1 + * 0x03, VBBNSel1 + * NOTE! PRCMU register + */ + REG_INIT(AB8500_VBBSEL1, 0x04, 0x11, 0x0f), + /* + * 0x0c, VBBNSel2 + * 0x03, VBBPSel2 + * NOTE! PRCMU register + */ + REG_INIT(AB8500_VBBSEL2, 0x04, 0x12, 0x0f), + /* + * 0x03, Vsmps1Regu + * 0x0c, Vsmps1SelCtrl + */ + REG_INIT(AB8500_VSMPS1REGU, 0x04, 0x03, 0x0f), + /* + * 0x03, Vsmps2Regu + * 0x0c, Vsmps2SelCtrl + */ + REG_INIT(AB8500_VSMPS2REGU, 0x04, 0x04, 0x0f), /* * 0x0c, VanaRegu * 0x03, VpllRegu @@ -601,9 +633,10 @@ static struct ab8500_reg_init ab8500_reg_init[] = { */ REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), /* + * 0x0c, Vrf1Regu * 0x03, Vaux3Regu */ - REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), + REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), /* * 0x3f, Vsmps1Sel1 */ diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 7bd73bbdfd1b..2c6c9625013c 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -86,7 +86,15 @@ enum ab8500_regulator_reg { AB8500_REGUCTRL2SPARE, AB8500_REGUCTRLDISCH, AB8500_REGUCTRLDISCH2, + AB8500_ARMREGU2, /* NOTE! PRCMU register */ + AB8500_VBBSEL1, /* NOTE! PRCMU register */ + AB8500_VBBSEL2, /* NOTE! PRCMU register */ + AB8500_VSMPS1REGU, + AB8500_VSMPS2REGU, + AB8500_VSMPS3REGU, /* NOTE! PRCMU register */ AB8500_VSMPS1SEL1, + AB8500_VSMPS3SEL1, /* NOTE! PRCMU register */ + AB8500_VSMPS3SEL2, /* NOTE! 
PRCMU register */ AB8500_NUM_REGULATOR_REGISTERS, }; -- cgit From 3c1b8438d4bc99269aba560739e3e6cb640584f4 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 21 Mar 2013 15:59:01 +0000 Subject: ARM: ux500: regulators: Add mask for configuration There is already a register mask in the regulator driver to allow some bits of a register to be initialized. The register value is defined in the board configuration. This patch puts a mask in the board configuration to specify which bits should actually be altered. The purpose of this patch is to avoid future mistakes when updating the allowed bits in the regulator driver. Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500-regulators.c | 66 +++++++++++++-------------- drivers/regulator/ab8500.c | 36 ++++++--------- include/linux/regulator/ab8500.h | 10 ++-- 3 files changed, 53 insertions(+), 59 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.c b/arch/arm/mach-ux500/board-mop500-regulators.c index 4b3c51905071..96dd17490bea 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.c +++ b/arch/arm/mach-ux500/board-mop500-regulators.c @@ -129,19 +129,19 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * VpllRequestCtrl = HP/LP depending on VxRequest * VextSupply1RequestCtrl = HP/LP depending on VxRequest */ - INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL2, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL2, 0xfc, 0x00), /* * VextSupply2RequestCtrl = HP/LP depending on VxRequest * VextSupply3RequestCtrl = HP/LP depending on VxRequest * Vaux1RequestCtrl = HP/LP depending on VxRequest * Vaux2RequestCtrl = HP/LP depending on VxRequest */ - INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL3, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL3, 0xff, 0x00), /* * Vaux3RequestCtrl = HP/LP depending on VxRequest * SwHPReq = Control through SWValid disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL4, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL4, 0x07, 0x00), /* * Vsmps1SysClkReq1HPValid = enabled * Vsmps2SysClkReq1HPValid = enabled @@ -152,44 +152,44 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * Vaux2SysClkReq1HPValid = disabled * Vaux3SysClkReq1HPValid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0x17), + INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0xff, 0x17), /* * VextSupply1SysClkReq1HPValid = disabled * VextSupply2SysClkReq1HPValid = disabled * VextSupply3SysClkReq1HPValid = SysClkReq1 controlled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID2, 0x40), + INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID2, 0x70, 0x40), /* * VanaHwHPReq1Valid = disabled * Vaux1HwHPreq1Valid = disabled * Vaux2HwHPReq1Valid = disabled * Vaux3HwHPReqValid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ1VALID1, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ1VALID1, 0xe8, 0x00), /* * VextSupply1HwHPReq1Valid = disabled * VextSupply2HwHPReq1Valid = disabled * VextSupply3HwHPReq1Valid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ1VALID2, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ1VALID2, 0x07, 0x00), /* * VanaHwHPReq2Valid = disabled * Vaux1HwHPReq2Valid = disabled * Vaux2HwHPReq2Valid = disabled * Vaux3HwHPReq2Valid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ2VALID1, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ2VALID1, 0xe8, 0x00), /* * VextSupply1HwHPReq2Valid = disabled * VextSupply2HwHPReq2Valid = disabled 
* VextSupply3HwHPReq2Valid = HWReq2 controlled */ - INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ2VALID2, 0x04), + INIT_REGULATOR_REGISTER(AB8500_REGUHWHPREQ2VALID2, 0x07, 0x04), /* * VanaSwHPReqValid = disabled * Vaux1SwHPReqValid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSWHPREQVALID1, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUSWHPREQVALID1, 0xa0, 0x00), /* * Vaux2SwHPReqValid = disabled * Vaux3SwHPReqValid = disabled @@ -197,7 +197,7 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * VextSupply2SwHPReqValid = disabled * VextSupply3SwHPReqValid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSWHPREQVALID2, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUSWHPREQVALID2, 0x1f, 0x00), /* * SysClkReq2Valid1 = SysClkReq2 controlled * SysClkReq3Valid1 = disabled @@ -207,7 +207,7 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * SysClkReq7Valid1 = disabled * SysClkReq8Valid1 = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQVALID1, 0x2a), + INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQVALID1, 0xfe, 0x2a), /* * SysClkReq2Valid2 = disabled * SysClkReq3Valid2 = disabled @@ -217,7 +217,7 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * SysClkReq7Valid2 = disabled * SysClkReq8Valid2 = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQVALID2, 0x20), + INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQVALID2, 0xfe, 0x20), /* * VTVoutEna = disabled * Vintcore12Ena = disabled @@ -225,57 +225,57 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * Vintcore12LP = inactive (HP) * VTVoutLP = inactive (HP) */ - INIT_REGULATOR_REGISTER(AB8500_REGUMISC1, 0x10), + INIT_REGULATOR_REGISTER(AB8500_REGUMISC1, 0xfe, 0x10), /* * VaudioEna = disabled * VdmicEna = disabled * Vamic1Ena = disabled * Vamic2Ena = disabled */ - INIT_REGULATOR_REGISTER(AB8500_VAUDIOSUPPLY, 0x00), + INIT_REGULATOR_REGISTER(AB8500_VAUDIOSUPPLY, 0x1e, 0x00), /* * Vamic1_dzout = high-Z when Vamic1 is disabled * Vamic2_dzout = high-Z when Vamic2 is disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUCTRL1VAMIC, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUCTRL1VAMIC, 0x03, 0x00), /* * VBBN = force OFF * VBBP = force OFF * NOTE! PRCMU register */ - INIT_REGULATOR_REGISTER(AB8500_ARMREGU2, 0x00), + INIT_REGULATOR_REGISTER(AB8500_ARMREGU2, 0x0f, 0x00), /* * VBBNSel1 = VBBP = VBBPFB * VBBPSel1 = 0 V * NOTE! PRCMU register */ - INIT_REGULATOR_REGISTER(AB8500_VBBSEL1, 0x00), + INIT_REGULATOR_REGISTER(AB8500_VBBSEL1, 0x0f, 0x00), /* * VBBNSel2 = VBBP = VBBPFB * VBBPSel2 = 0 V * NOTE! 
PRCMU register */ - INIT_REGULATOR_REGISTER(AB8500_VBBSEL2, 0x00), + INIT_REGULATOR_REGISTER(AB8500_VBBSEL2, 0x0f, 0x00), /* * Vsmps1Regu = HW control * Vsmps1SelCtrl = Vsmps1 voltage defined by Vsmsp1Sel2 */ - INIT_REGULATOR_REGISTER(AB8500_VSMPS1REGU, 0x06), + INIT_REGULATOR_REGISTER(AB8500_VSMPS1REGU, 0x0f, 0x06), /* * Vsmps2Regu = HW control * Vsmps2SelCtrl = Vsmps2 voltage defined by Vsmsp2Sel2 */ - INIT_REGULATOR_REGISTER(AB8500_VSMPS2REGU, 0x06), + INIT_REGULATOR_REGISTER(AB8500_VSMPS2REGU, 0x0f, 0x06), /* * VPll = Hw controlled * VanaRegu = force off */ - INIT_REGULATOR_REGISTER(AB8500_VPLLVANAREGU, 0x02), + INIT_REGULATOR_REGISTER(AB8500_VPLLVANAREGU, 0x0f, 0x02), /* * VrefDDREna = disabled * VrefDDRSleepMode = inactive (no pulldown) */ - INIT_REGULATOR_REGISTER(AB8500_VREFDDR, 0x00), + INIT_REGULATOR_REGISTER(AB8500_VREFDDR, 0x03, 0x00), /* * VextSupply1Regu = HW control * VextSupply2Regu = HW control @@ -283,37 +283,37 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * ExtSupply2Bypass = ExtSupply12LPn ball is 0 when Ena is 0 * ExtSupply3Bypass = ExtSupply3LPn ball is 0 when Ena is 0 */ - INIT_REGULATOR_REGISTER(AB8500_EXTSUPPLYREGU, 0x2a), + INIT_REGULATOR_REGISTER(AB8500_EXTSUPPLYREGU, 0xff, 0x1a), /* * Vaux1Regu = force HP * Vaux2Regu = force off */ - INIT_REGULATOR_REGISTER(AB8500_VAUX12REGU, 0x01), + INIT_REGULATOR_REGISTER(AB8500_VAUX12REGU, 0x0f, 0x01), /* * Vrf1Regu = HW control * Vaux3Regu = force off */ - INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU, 0x08), + INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU, 0x0f, 0x08), /* * Vsmps1 = 1.15V */ - INIT_REGULATOR_REGISTER(AB8500_VSMPS1SEL1, 0x24), + INIT_REGULATOR_REGISTER(AB8500_VSMPS1SEL1, 0x3f, 0x24), /* * Vaux1Sel = 2.5 V */ - INIT_REGULATOR_REGISTER(AB8500_VAUX1SEL, 0x08), + INIT_REGULATOR_REGISTER(AB8500_VAUX1SEL, 0x0f, 0x08), /* * Vaux2Sel = 2.9 V */ - INIT_REGULATOR_REGISTER(AB8500_VAUX2SEL, 0x0d), + INIT_REGULATOR_REGISTER(AB8500_VAUX2SEL, 0x0f, 0x0d), /* * Vaux3Sel = 2.91 V */ - INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3SEL, 0x07), + INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3SEL, 0x07, 0x07), /* * VextSupply12LP = disabled (no LP) */ - INIT_REGULATOR_REGISTER(AB8500_REGUCTRL2SPARE, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUCTRL2SPARE, 0x01, 0x00), /* * Vaux1Disch = short discharge time * Vaux2Disch = short discharge time @@ -322,13 +322,13 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * VTVoutDisch = short discharge time * VaudioDisch = short discharge time */ - INIT_REGULATOR_REGISTER(AB8500_REGUCTRLDISCH, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUCTRLDISCH, 0xfc, 0x00), /* * VanaDisch = short discharge time * VdmicPullDownEna = pulldown disabled when Vdmic is disabled * VdmicDisch = short discharge time */ - INIT_REGULATOR_REGISTER(AB8500_REGUCTRLDISCH2, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUCTRLDISCH2, 0x16, 0x00), }; /* AB8500 regulators */ diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 3465ac38bffe..a847744f8c20 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -811,23 +811,20 @@ static struct ab8500_reg_init ab8500_reg_init[] = { REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), }; -static int -ab8500_regulator_init_registers(struct platform_device *pdev, int id, int value) +static int ab8500_regulator_init_registers(struct platform_device *pdev, + int id, int mask, int value) { int err; - if (value & ~ab8500_reg_init[id].mask) { - dev_err(&pdev->dev, - "Configuration error: value outside mask.\n"); - return 
-EINVAL; - } + BUG_ON(value & ~mask); + BUG_ON(mask & ~ab8500_reg_init[id].mask); + /* initialize register */ err = abx500_mask_and_set_register_interruptible( &pdev->dev, ab8500_reg_init[id].bank, ab8500_reg_init[id].addr, - ab8500_reg_init[id].mask, - value); + mask, value); if (err < 0) { dev_err(&pdev->dev, "Failed to initialize 0x%02x, 0x%02x.\n", @@ -835,13 +832,11 @@ ab8500_regulator_init_registers(struct platform_device *pdev, int id, int value) ab8500_reg_init[id].addr); return err; } - dev_vdbg(&pdev->dev, - "init: 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", - ab8500_reg_init[id].bank, - ab8500_reg_init[id].addr, - ab8500_reg_init[id].mask, - value); + " init: 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", + ab8500_reg_init[id].bank, + ab8500_reg_init[id].addr, + mask, value); return 0; } @@ -960,19 +955,16 @@ static int ab8500_regulator_probe(struct platform_device *pdev) /* initialize registers */ for (i = 0; i < pdata->num_regulator_reg_init; i++) { - int id, value; + int id, mask, value; id = pdata->regulator_reg_init[i].id; + mask = pdata->regulator_reg_init[i].mask; value = pdata->regulator_reg_init[i].value; /* check for configuration errors */ - if (id >= AB8500_NUM_REGULATOR_REGISTERS) { - dev_err(&pdev->dev, - "Configuration error: id outside range.\n"); - return -EINVAL; - } + BUG_ON(id >= AB8500_NUM_REGULATOR_REGISTERS); - err = ab8500_regulator_init_registers(pdev, id, value); + err = ab8500_regulator_init_registers(pdev, id, mask, value); if (err < 0) return err; } diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 2c6c9625013c..a1d245f13d9c 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -48,13 +48,15 @@ enum ab9540_regulator_id { /* AB8500 and AB9540 register initialization */ struct ab8500_regulator_reg_init { int id; + u8 mask; u8 value; }; -#define INIT_REGULATOR_REGISTER(_id, _value) \ - { \ - .id = _id, \ - .value = _value, \ +#define INIT_REGULATOR_REGISTER(_id, _mask, _value) \ + { \ + .id = _id, \ + .mask = _mask, \ + .value = _value, \ } /* AB8500 registers */ -- cgit From 33bc8f46a8ee3fc1836def9713933435b7ff0b90 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 21 Mar 2013 15:59:02 +0000 Subject: regulator: ab8500: Another push to synchronise recent AB8500 developments This patch ensures that many of the recent developments pertaining to the AB8500 regulator device are propagated out into the public arena. It aims to update some of the existing initialisation values in accordance with internal ST-Ericsson code submissions. This single patch was originally a collection of updates which have been squashed together to aid with clarity. Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500-regulators.c | 18 ------ drivers/regulator/ab8500.c | 86 +++++++++++++++++---------- include/linux/regulator/ab8500.h | 21 ++++--- 3 files changed, 65 insertions(+), 60 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.c b/arch/arm/mach-ux500/board-mop500-regulators.c index 96dd17490bea..a8141e3e8ca1 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.c +++ b/arch/arm/mach-ux500/board-mop500-regulators.c @@ -238,24 +238,6 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { * Vamic2_dzout = high-Z when Vamic2 is disabled */ INIT_REGULATOR_REGISTER(AB8500_REGUCTRL1VAMIC, 0x03, 0x00), - /* - * VBBN = force OFF - * VBBP = force OFF - * NOTE! 
PRCMU register - */ - INIT_REGULATOR_REGISTER(AB8500_ARMREGU2, 0x0f, 0x00), - /* - * VBBNSel1 = VBBP = VBBPFB - * VBBPSel1 = 0 V - * NOTE! PRCMU register - */ - INIT_REGULATOR_REGISTER(AB8500_VBBSEL1, 0x0f, 0x00), - /* - * VBBNSel2 = VBBP = VBBPFB - * VBBPSel2 = 0 V - * NOTE! PRCMU register - */ - INIT_REGULATOR_REGISTER(AB8500_VBBSEL2, 0x0f, 0x00), /* * Vsmps1Regu = HW control * Vsmps1SelCtrl = Vsmps1 voltage defined by Vsmsp1Sel2 diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index a847744f8c20..c7784c4bff4f 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -613,11 +613,19 @@ struct ab8500_reg_init { static struct ab8500_reg_init ab8500_reg_init[] = { /* - * 0x30, VanaRequestCtrl + * 0x03, VarmRequestCtrl + * 0x0c, VapeRequestCtrl + * 0x30, Vsmps1RequestCtrl + * 0xc0, Vsmps2RequestCtrl + */ + REG_INIT(AB8500_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), + /* + * 0x03, Vsmps3RequestCtrl * 0x0c, VpllRequestCtrl + * 0x30, VanaRequestCtrl * 0xc0, VextSupply1RequestCtrl */ - REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xfc), + REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), /* * 0x03, VextSupply2RequestCtrl * 0x0c, VextSupply3RequestCtrl @@ -642,50 +650,71 @@ static struct ab8500_reg_init ab8500_reg_init[] = { */ REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), /* + * 0x01, VapeSysClkReq1HPValid + * 0x02, VarmSysClkReq1HPValid + * 0x04, VbbSysClkReq1HPValid + * 0x08, VmodSysClkReq1HPValid * 0x10, VextSupply1SysClkReq1HPValid * 0x20, VextSupply2SysClkReq1HPValid * 0x40, VextSupply3SysClkReq1HPValid */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x7f), /* + * 0x01, Vsmps1HwHPReq1Valid + * 0x02, Vsmps2HwHPReq1Valid + * 0x04, Vsmps3HwHPReq1Valid * 0x08, VanaHwHPReq1Valid + * 0x10, VpllHwHPReq1Valid * 0x20, Vaux1HwHPReq1Valid * 0x40, Vaux2HwHPReq1Valid * 0x80, Vaux3HwHPReq1Valid */ - REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), + REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), /* * 0x01, VextSupply1HwHPReq1Valid * 0x02, VextSupply2HwHPReq1Valid * 0x04, VextSupply3HwHPReq1Valid + * 0x08, VmodHwHPReq1Valid */ - REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), + REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x0f), /* + * 0x01, Vsmps1HwHPReq2Valid + * 0x02, Vsmps2HwHPReq2Valid + * 0x03, Vsmps3HwHPReq2Valid * 0x08, VanaHwHPReq2Valid + * 0x10, VpllHwHPReq2Valid * 0x20, Vaux1HwHPReq2Valid * 0x40, Vaux2HwHPReq2Valid * 0x80, Vaux3HwHPReq2Valid */ - REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), + REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), /* * 0x01, VextSupply1HwHPReq2Valid * 0x02, VextSupply2HwHPReq2Valid * 0x04, VextSupply3HwHPReq2Valid + * 0x08, VmodHwHPReq2Valid */ - REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), + REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x0f), /* + * 0x01, VapeSwHPReqValid + * 0x02, VarmSwHPReqValid + * 0x04, Vsmps1SwHPReqValid + * 0x08, Vsmps2SwHPReqValid + * 0x10, Vsmps3SwHPReqValid * 0x20, VanaSwHPReqValid + * 0x40, VpllSwHPReqValid * 0x80, Vaux1SwHPReqValid */ - REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), + REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), /* * 0x01, Vaux2SwHPReqValid * 0x02, Vaux3SwHPReqValid * 0x04, VextSupply1SwHPReqValid * 0x08, VextSupply2SwHPReqValid * 0x10, VextSupply3SwHPReqValid + * 0x20, VmodSwHPReqValid */ - REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), + REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x3f), /* * 0x02, SysClkReq2Valid1 * ... 
@@ -718,37 +747,23 @@ static struct ab8500_reg_init ab8500_reg_init[] = { * 0x02, Vamic2_dzout */ REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), - /* - * 0x0c, VBBNRegu - * 0x03, VBBPRegu - * NOTE! PRCMU register - */ - REG_INIT(AB8500_ARMREGU2, 0x04, 0x01, 0x0f), - /* - * 0x0c, VBBPSel1 - * 0x03, VBBNSel1 - * NOTE! PRCMU register - */ - REG_INIT(AB8500_VBBSEL1, 0x04, 0x11, 0x0f), - /* - * 0x0c, VBBNSel2 - * 0x03, VBBPSel2 - * NOTE! PRCMU register - */ - REG_INIT(AB8500_VBBSEL2, 0x04, 0x12, 0x0f), /* * 0x03, Vsmps1Regu * 0x0c, Vsmps1SelCtrl + * 0x10, Vsmps1AutoMode + * 0x20, Vsmps1PWMMode */ - REG_INIT(AB8500_VSMPS1REGU, 0x04, 0x03, 0x0f), + REG_INIT(AB8500_VSMPS1REGU, 0x04, 0x03, 0x3f), /* * 0x03, Vsmps2Regu * 0x0c, Vsmps2SelCtrl + * 0x10, Vsmps2AutoMode + * 0x20, Vsmps2PWMMode */ - REG_INIT(AB8500_VSMPS2REGU, 0x04, 0x04, 0x0f), + REG_INIT(AB8500_VSMPS2REGU, 0x04, 0x04, 0x3f), /* - * 0x0c, VanaRegu * 0x03, VpllRegu + * 0x0c, VanaRegu */ REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), /* @@ -788,13 +803,16 @@ static struct ab8500_reg_init ab8500_reg_init[] = { REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), /* * 0x07, Vaux3Sel + * 0x30, Vrf1Sel */ - REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), + REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x37), /* * 0x01, VextSupply12LP */ REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), /* + * 0x01, VpllDisch + * 0x02, Vrf1Disch * 0x04, Vaux1Disch * 0x08, Vaux2Disch * 0x10, Vaux3Disch @@ -802,13 +820,15 @@ static struct ab8500_reg_init ab8500_reg_init[] = { * 0x40, VTVoutDisch * 0x80, VaudioDisch */ - REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), + REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xff), /* + * 0x01, VsimDisch * 0x02, VanaDisch * 0x04, VdmicPullDownEna + * 0x08, VpllPullDownEna * 0x10, VdmicDisch */ - REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), + REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x1f), }; static int ab8500_regulator_init_registers(struct platform_device *pdev, diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index a1d245f13d9c..dd7944f735d8 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -61,6 +61,7 @@ struct ab8500_regulator_reg_init { /* AB8500 registers */ enum ab8500_regulator_reg { + AB8500_REGUREQUESTCTRL1, AB8500_REGUREQUESTCTRL2, AB8500_REGUREQUESTCTRL3, AB8500_REGUREQUESTCTRL4, @@ -77,26 +78,28 @@ enum ab8500_regulator_reg { AB8500_REGUMISC1, AB8500_VAUDIOSUPPLY, AB8500_REGUCTRL1VAMIC, + AB8500_VSMPS1REGU, + AB8500_VSMPS2REGU, + AB8500_VSMPS3REGU, /* NOTE! PRCMU register */ AB8500_VPLLVANAREGU, AB8500_VREFDDR, AB8500_EXTSUPPLYREGU, AB8500_VAUX12REGU, AB8500_VRF1VAUX3REGU, + AB8500_VSMPS1SEL1, + AB8500_VSMPS1SEL2, + AB8500_VSMPS1SEL3, + AB8500_VSMPS2SEL1, + AB8500_VSMPS2SEL2, + AB8500_VSMPS2SEL3, + AB8500_VSMPS3SEL1, /* NOTE! PRCMU register */ + AB8500_VSMPS3SEL2, /* NOTE! PRCMU register */ AB8500_VAUX1SEL, AB8500_VAUX2SEL, AB8500_VRF1VAUX3SEL, AB8500_REGUCTRL2SPARE, AB8500_REGUCTRLDISCH, AB8500_REGUCTRLDISCH2, - AB8500_ARMREGU2, /* NOTE! PRCMU register */ - AB8500_VBBSEL1, /* NOTE! PRCMU register */ - AB8500_VBBSEL2, /* NOTE! PRCMU register */ - AB8500_VSMPS1REGU, - AB8500_VSMPS2REGU, - AB8500_VSMPS3REGU, /* NOTE! PRCMU register */ - AB8500_VSMPS1SEL1, - AB8500_VSMPS3SEL1, /* NOTE! PRCMU register */ - AB8500_VSMPS3SEL2, /* NOTE! 
PRCMU register */ AB8500_NUM_REGULATOR_REGISTERS, }; -- cgit From 732805a563617aafc7405409c03182afafb3943b Mon Sep 17 00:00:00 2001 From: Bengt Jonsson Date: Thu, 21 Mar 2013 15:59:03 +0000 Subject: regulator: ab8500: Separate regulator and MFD platform data The ab8500 MFD should not have knowledge about regulator- specific platform data like number of regulators and regulator registers. As the regulator platform data is about to grow with external regulators, this information is moved to a new structure provided by the regulator driver. Signed-off-by: Bengt Jonsson Signed-off-by: Lee Jones Reviewed-by: Yvan FILLION Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500-regulators.c | 12 +++++++++--- arch/arm/mach-ux500/board-mop500-regulators.h | 4 +--- drivers/regulator/ab8500.c | 21 +++++++++++++++------ include/linux/regulator/ab8500.h | 7 +++++++ 4 files changed, 32 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.c b/arch/arm/mach-ux500/board-mop500-regulators.c index a8141e3e8ca1..0fd84d42e1ec 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.c +++ b/arch/arm/mach-ux500/board-mop500-regulators.c @@ -122,8 +122,7 @@ static struct regulator_consumer_supply ab8500_vana_consumers[] = { }; /* ab8500 regulator register initialization */ -struct ab8500_regulator_reg_init -ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { +static struct ab8500_regulator_reg_init ab8500_reg_init[] = { /* * VanaRequestCtrl = HP/LP depending on VxRequest * VpllRequestCtrl = HP/LP depending on VxRequest @@ -314,7 +313,7 @@ ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS] = { }; /* AB8500 regulators */ -struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS] = { +static struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS] = { /* supplies to the display/camera */ [AB8500_LDO_AUX1] = { .constraints = { @@ -423,3 +422,10 @@ struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS] = { .consumer_supplies = ab8500_vana_consumers, }, }; + +struct ab8500_regulator_platform_data ab8500_regulator_plat_data = { + .reg_init = ab8500_reg_init, + .num_reg_init = ARRAY_SIZE(ab8500_reg_init), + .regulator = ab8500_regulators, + .num_regulator = ARRAY_SIZE(ab8500_regulators), +}; diff --git a/arch/arm/mach-ux500/board-mop500-regulators.h b/arch/arm/mach-ux500/board-mop500-regulators.h index 78a0642a2206..9ca4869a6f23 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.h +++ b/arch/arm/mach-ux500/board-mop500-regulators.h @@ -14,9 +14,7 @@ #include #include -extern struct ab8500_regulator_reg_init -ab8500_regulator_reg_init[AB8500_NUM_REGULATOR_REGISTERS]; -extern struct regulator_init_data ab8500_regulators[AB8500_NUM_REGULATORS]; +extern struct ab8500_regulator_platform_data ab8500_regulator_plat_data; extern struct regulator_init_data tps61052_regulator; extern struct regulator_init_data gpio_en_3v3_regulator; diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index c7784c4bff4f..f7d1f538c200 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -939,8 +939,9 @@ ab8500_regulator_of_probe(struct platform_device *pdev, struct device_node *np) static int ab8500_regulator_probe(struct platform_device *pdev) { struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent); - struct ab8500_platform_data *pdata; struct device_node *np = pdev->dev.of_node; + struct ab8500_platform_data *ppdata; + struct ab8500_regulator_platform_data *pdata; int i, err; if 
(np) { @@ -961,7 +962,14 @@ static int ab8500_regulator_probe(struct platform_device *pdev) dev_err(&pdev->dev, "null mfd parent\n"); return -EINVAL; } - pdata = dev_get_platdata(ab8500->dev); + + ppdata = dev_get_platdata(ab8500->dev); + if (!ppdata) { + dev_err(&pdev->dev, "null parent pdata\n"); + return -EINVAL; + } + + pdata = ppdata->regulator; if (!pdata) { dev_err(&pdev->dev, "null pdata\n"); return -EINVAL; @@ -974,12 +982,12 @@ static int ab8500_regulator_probe(struct platform_device *pdev) } /* initialize registers */ - for (i = 0; i < pdata->num_regulator_reg_init; i++) { + for (i = 0; i < pdata->num_reg_init; i++) { int id, mask, value; - id = pdata->regulator_reg_init[i].id; - mask = pdata->regulator_reg_init[i].mask; - value = pdata->regulator_reg_init[i].value; + id = pdata->reg_init[i].id; + mask = pdata->reg_init[i].mask; + value = pdata->reg_init[i].value; /* check for configuration errors */ BUG_ON(id >= AB8500_NUM_REGULATOR_REGISTERS); @@ -1045,5 +1053,6 @@ module_exit(ab8500_regulator_exit); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Sundar Iyer "); +MODULE_AUTHOR("Bengt Jonsson "); MODULE_DESCRIPTION("Regulator Driver for ST-Ericsson AB8500 Mixed-Sig PMIC"); MODULE_ALIAS("platform:ab8500-regulator"); diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index dd7944f735d8..3a8e02687f7b 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -152,4 +152,11 @@ enum ab9540_regulator_reg { AB9540_NUM_REGULATOR_REGISTERS, }; +struct ab8500_regulator_platform_data { + int num_reg_init; + struct ab8500_regulator_reg_init *reg_init; + int num_regulator; + struct regulator_init_data *regulator; +}; + #endif -- cgit From 9d0ca6ed6f2f12eb488f450d5d38d047aa402a53 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 21 Mar 2013 14:17:34 +0000 Subject: virtio: remove obsolete virtqueue_get_queue_index() You can access it directly now, since 3.8: v3.7-rc1-13-g06ca287 'virtio: move queue_index and num_free fields into core struct virtqueue.' Cc: Cornelia Huck Signed-off-by: Rusty Russell Acked-by: Cornelia Huck Signed-off-by: David S. 
Miller --- drivers/net/virtio_net.c | 4 ++-- drivers/s390/kvm/virtio_ccw.c | 6 +++--- include/linux/virtio.h | 6 ------ 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 57ac4b0294bc..f7d67e8eb1aa 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -154,7 +154,7 @@ struct padded_vnet_hdr { */ static int vq2txq(struct virtqueue *vq) { - return (virtqueue_get_queue_index(vq) - 1) / 2; + return (vq->index - 1) / 2; } static int txq2vq(int txq) @@ -164,7 +164,7 @@ static int txq2vq(int txq) static int vq2rxq(struct virtqueue *vq) { - return virtqueue_get_queue_index(vq) / 2; + return vq->index / 2; } static int rxq2vq(int rxq) diff --git a/drivers/s390/kvm/virtio_ccw.c b/drivers/s390/kvm/virtio_ccw.c index 2029b6caa595..fb877b59ec57 100644 --- a/drivers/s390/kvm/virtio_ccw.c +++ b/drivers/s390/kvm/virtio_ccw.c @@ -166,7 +166,7 @@ static void virtio_ccw_kvm_notify(struct virtqueue *vq) vcdev = to_vc_device(info->vq->vdev); ccw_device_get_schid(vcdev->cdev, &schid); - do_kvm_notify(schid, virtqueue_get_queue_index(vq)); + do_kvm_notify(schid, vq->index); } static int virtio_ccw_read_vq_conf(struct virtio_ccw_device *vcdev, @@ -188,7 +188,7 @@ static void virtio_ccw_del_vq(struct virtqueue *vq, struct ccw1 *ccw) unsigned long flags; unsigned long size; int ret; - unsigned int index = virtqueue_get_queue_index(vq); + unsigned int index = vq->index; /* Remove from our list. */ spin_lock_irqsave(&vcdev->lock, flags); @@ -610,7 +610,7 @@ static struct virtqueue *virtio_ccw_vq_by_ind(struct virtio_ccw_device *vcdev, vq = NULL; spin_lock_irqsave(&vcdev->lock, flags); list_for_each_entry(info, &vcdev->virtqueues, node) { - if (virtqueue_get_queue_index(info->vq) == index) { + if (info->vq->index == index) { vq = info->vq; break; } diff --git a/include/linux/virtio.h b/include/linux/virtio.h index ff6714e6d0f5..2d7a5e045908 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -58,12 +58,6 @@ void *virtqueue_detach_unused_buf(struct virtqueue *vq); unsigned int virtqueue_get_vring_size(struct virtqueue *vq); -/* FIXME: Obsolete accessor, but required for virtio_net merge. */ -static inline unsigned int virtqueue_get_queue_index(struct virtqueue *vq) -{ - return vq->index; -} - /** * virtio_device - representation of a device using virtio * @index: unique position on the virtio bus -- cgit From c3a07134e6aa5b93a37f72ffa3d11fadf72bf757 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Fri, 22 Mar 2013 03:39:28 +0000 Subject: mv643xx_eth: convert to use the Marvell Orion MDIO driver This patch converts the Marvell MV643XX ethernet driver to use the Marvell Orion MDIO driver. As a result, PowerPC and ARM platforms registering the Marvell MV643XX ethernet driver are also updated to register a Marvell Orion MDIO driver. This driver voluntarily overlaps with the Marvell Ethernet shared registers because it will use a subset of this shared register (shared_base + 0x4 to shared_base + 0x84). The Ethernet driver is also updated to look up for a PHY device using the Orion MDIO bus driver. For ARM and PowerPC we register a single instance of the "mvmdio" driver in the system like it used to be done with the use of the "shared_smi" platform_data cookie on ARM. 
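For reference, the registration pattern on a board that does not use the common helpers could look roughly like this (hypothetical names and base address; compare the pegasos_eth.c hunk below, which follows the same shape):

	/* Hypothetical board code: an "orion-mdio" platform device claiming
	 * the SMI window at shared_base + 0x4 .. shared_base + 0x83. */
	static struct resource board_mdio_resources[] = {
		{
			.start	= BOARD_ETH_SHARED_BASE + 0x4,
			.end	= BOARD_ETH_SHARED_BASE + 0x83,
			.flags	= IORESOURCE_MEM,
		},
	};

	static struct platform_device board_mdio_device = {
		.name		= "orion-mdio",
		.id		= -1,
		.num_resources	= ARRAY_SIZE(board_mdio_resources),
		.resource	= board_mdio_resources,
	};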
Note that it is safe to register the mvmdio driver only for the "ge00" instance of the driver because this "ge00" interface is guaranteed to always be explicitly registered by consumers of arch/arm/plat-orion/common.c and other instances (ge01, ge10 and ge11) were all pointing their shared_smi to ge00. For PowerPC the in-tree Device Tree Source files mention only one MV643XX ethernet MAC instance so the MDIO bus driver is registered only when id == 0. Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- arch/arm/plat-orion/common.c | 54 ++++---- arch/powerpc/platforms/chrp/pegasos_eth.c | 20 +++ arch/powerpc/sysdev/mv64x60_dev.c | 16 ++- drivers/net/ethernet/marvell/Kconfig | 5 +- drivers/net/ethernet/marvell/Makefile | 2 +- drivers/net/ethernet/marvell/mv643xx_eth.c | 195 +++-------------------------- include/linux/mv643xx_eth.h | 1 - 7 files changed, 84 insertions(+), 209 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/plat-orion/common.c b/arch/arm/plat-orion/common.c index 2d4b6414609f..251f827271e9 100644 --- a/arch/arm/plat-orion/common.c +++ b/arch/arm/plat-orion/common.c @@ -238,6 +238,7 @@ static __init void ge_complete( struct mv643xx_eth_shared_platform_data *orion_ge_shared_data, struct resource *orion_ge_resource, unsigned long irq, struct platform_device *orion_ge_shared, + struct platform_device *orion_ge_mvmdio, struct mv643xx_eth_platform_data *eth_data, struct platform_device *orion_ge) { @@ -247,6 +248,8 @@ static __init void ge_complete( orion_ge->dev.platform_data = eth_data; platform_device_register(orion_ge_shared); + if (orion_ge_mvmdio) + platform_device_register(orion_ge_mvmdio); platform_device_register(orion_ge); } @@ -258,8 +261,6 @@ struct mv643xx_eth_shared_platform_data orion_ge00_shared_data; static struct resource orion_ge00_shared_resources[] = { { .name = "ge00 base", - }, { - .name = "ge00 err irq", }, }; @@ -271,6 +272,19 @@ static struct platform_device orion_ge00_shared = { }, }; +static struct resource orion_ge_mvmdio_resources[] = { + { + .name = "ge00 mvmdio base", + }, { + .name = "ge00 mvmdio err irq", + }, +}; + +static struct platform_device orion_ge_mvmdio = { + .name = "orion-mdio", + .id = -1, +}; + static struct resource orion_ge00_resources[] = { { .name = "ge00 irq", @@ -295,26 +309,25 @@ void __init orion_ge00_init(struct mv643xx_eth_platform_data *eth_data, unsigned int tx_csum_limit) { fill_resources(&orion_ge00_shared, orion_ge00_shared_resources, - mapbase + 0x2000, SZ_16K - 1, irq_err); + mapbase + 0x2000, SZ_16K - 1, NO_IRQ); + fill_resources(&orion_ge_mvmdio, orion_ge_mvmdio_resources, + mapbase + 0x2004, 0x84 - 1, irq_err); orion_ge00_shared_data.tx_csum_limit = tx_csum_limit; ge_complete(&orion_ge00_shared_data, orion_ge00_resources, irq, &orion_ge00_shared, + &orion_ge_mvmdio, eth_data, &orion_ge00); } /***************************************************************************** * GE01 ****************************************************************************/ -struct mv643xx_eth_shared_platform_data orion_ge01_shared_data = { - .shared_smi = &orion_ge00_shared, -}; +struct mv643xx_eth_shared_platform_data orion_ge01_shared_data; static struct resource orion_ge01_shared_resources[] = { { .name = "ge01 base", - }, { - .name = "ge01 err irq", - }, + } }; static struct platform_device orion_ge01_shared = { @@ -349,26 +362,23 @@ void __init orion_ge01_init(struct mv643xx_eth_platform_data *eth_data, unsigned int tx_csum_limit) { fill_resources(&orion_ge01_shared, orion_ge01_shared_resources, -
mapbase + 0x2000, SZ_16K - 1, irq_err); + mapbase + 0x2000, SZ_16K - 1, NO_IRQ); orion_ge01_shared_data.tx_csum_limit = tx_csum_limit; ge_complete(&orion_ge01_shared_data, orion_ge01_resources, irq, &orion_ge01_shared, + NULL, eth_data, &orion_ge01); } /***************************************************************************** * GE10 ****************************************************************************/ -struct mv643xx_eth_shared_platform_data orion_ge10_shared_data = { - .shared_smi = &orion_ge00_shared, -}; +struct mv643xx_eth_shared_platform_data orion_ge10_shared_data; static struct resource orion_ge10_shared_resources[] = { { .name = "ge10 base", - }, { - .name = "ge10 err irq", - }, + } }; static struct platform_device orion_ge10_shared = { @@ -402,24 +412,21 @@ void __init orion_ge10_init(struct mv643xx_eth_platform_data *eth_data, unsigned long irq_err) { fill_resources(&orion_ge10_shared, orion_ge10_shared_resources, - mapbase + 0x2000, SZ_16K - 1, irq_err); + mapbase + 0x2000, SZ_16K - 1, NO_IRQ); ge_complete(&orion_ge10_shared_data, orion_ge10_resources, irq, &orion_ge10_shared, + NULL, eth_data, &orion_ge10); } /***************************************************************************** * GE11 ****************************************************************************/ -struct mv643xx_eth_shared_platform_data orion_ge11_shared_data = { - .shared_smi = &orion_ge00_shared, -}; +struct mv643xx_eth_shared_platform_data orion_ge11_shared_data; static struct resource orion_ge11_shared_resources[] = { { .name = "ge11 base", - }, { - .name = "ge11 err irq", }, }; @@ -454,9 +461,10 @@ void __init orion_ge11_init(struct mv643xx_eth_platform_data *eth_data, unsigned long irq_err) { fill_resources(&orion_ge11_shared, orion_ge11_shared_resources, - mapbase + 0x2000, SZ_16K - 1, irq_err); + mapbase + 0x2000, SZ_16K - 1, NO_IRQ); ge_complete(&orion_ge11_shared_data, orion_ge11_resources, irq, &orion_ge11_shared, + NULL, eth_data, &orion_ge11); } diff --git a/arch/powerpc/platforms/chrp/pegasos_eth.c b/arch/powerpc/platforms/chrp/pegasos_eth.c index 039fc8e82199..2b4dc6abde6c 100644 --- a/arch/powerpc/platforms/chrp/pegasos_eth.c +++ b/arch/powerpc/platforms/chrp/pegasos_eth.c @@ -47,6 +47,25 @@ static struct platform_device mv643xx_eth_shared_device = { .resource = mv643xx_eth_shared_resources, }; +/* + * The orion mdio driver only covers shared + 0x4 up to shared + 0x84 - 1 + */ +static struct resource mv643xx_eth_mvmdio_resources[] = { + [0] = { + .name = "ethernet mdio base", + .start = 0xf1000000 + MV643XX_ETH_SHARED_REGS + 0x4, + .end = 0xf1000000 + MV643XX_ETH_SHARED_REGS + 0x83, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device mv643xx_eth_mvmdio_device = { + .name = "orion-mdio", + .id = -1, + .num_resources = ARRAY_SIZE(mv643xx_eth_mvmdio_resources), + .resource = mv643xx_eth_mvmdio_resources, +}; + static struct resource mv643xx_eth_port1_resources[] = { [0] = { .name = "eth port1 irq", @@ -82,6 +101,7 @@ static struct platform_device eth_port1_device = { static struct platform_device *mv643xx_eth_pd_devs[] __initdata = { &mv643xx_eth_shared_device, + &mv643xx_eth_mvmdio_device, &eth_port1_device, }; diff --git a/arch/powerpc/sysdev/mv64x60_dev.c b/arch/powerpc/sysdev/mv64x60_dev.c index 0f6af41ebb44..4a25c26f0bf4 100644 --- a/arch/powerpc/sysdev/mv64x60_dev.c +++ b/arch/powerpc/sysdev/mv64x60_dev.c @@ -214,15 +214,27 @@ static struct platform_device * __init mv64x60_eth_register_shared_pdev( struct device_node *np, int id) { struct platform_device
*pdev; - struct resource r[1]; + struct resource r[2]; int err; err = of_address_to_resource(np, 0, &r[0]); if (err) return ERR_PTR(err); + /* register an orion mdio bus driver */ + r[1].start = r[0].start + 0x4; + r[1].end = r[0].start + 0x84 - 1; + r[1].flags = IORESOURCE_MEM; + + if (id == 0) { + pdev = platform_device_register_simple("orion-mdio", -1, &r[1], 1); + if (!pdev) + return pdev; + } + pdev = platform_device_register_simple(MV643XX_ETH_SHARED_NAME, id, - r, 1); + &r[0], 1); + return pdev; } diff --git a/drivers/net/ethernet/marvell/Kconfig b/drivers/net/ethernet/marvell/Kconfig index edfba9370922..5170ecb00acc 100644 --- a/drivers/net/ethernet/marvell/Kconfig +++ b/drivers/net/ethernet/marvell/Kconfig @@ -23,6 +23,7 @@ config MV643XX_ETH depends on (MV64X60 || PPC32 || PLAT_ORION) && INET select INET_LRO select PHYLIB + select MVMDIO ---help--- This driver supports the gigabit ethernet MACs in the Marvell Discovery PPC/MIPS chipset family (MV643XX) and @@ -38,9 +39,7 @@ config MVMDIO interface units of the Marvell EBU SoCs (Kirkwood, Orion5x, Dove, Armada 370 and Armada XP). - For now, this driver is only needed for the MVNETA driver - (used on Armada 370 and XP), but it could be used in the - future by the MV643XX_ETH driver. + This driver is used by the MV643XX_ETH and MVNETA drivers. config MVNETA tristate "Marvell Armada 370/XP network interface support" diff --git a/drivers/net/ethernet/marvell/Makefile b/drivers/net/ethernet/marvell/Makefile index 7f63b4aac434..5c4a7765ff0e 100644 --- a/drivers/net/ethernet/marvell/Makefile +++ b/drivers/net/ethernet/marvell/Makefile @@ -2,8 +2,8 @@ # Makefile for the Marvell device drivers. # -obj-$(CONFIG_MV643XX_ETH) += mv643xx_eth.o obj-$(CONFIG_MVMDIO) += mvmdio.o +obj-$(CONFIG_MV643XX_ETH) += mv643xx_eth.o obj-$(CONFIG_MVNETA) += mvneta.o obj-$(CONFIG_PXA168_ETH) += pxa168_eth.o obj-$(CONFIG_SKGE) += skge.o diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index d1ecf4bf7da7..a65a92ef19ec 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -69,14 +69,6 @@ static char mv643xx_eth_driver_version[] = "1.4"; * Registers shared between all ports. */ #define PHY_ADDR 0x0000 -#define SMI_REG 0x0004 -#define SMI_BUSY 0x10000000 -#define SMI_READ_VALID 0x08000000 -#define SMI_OPCODE_READ 0x04000000 -#define SMI_OPCODE_WRITE 0x00000000 -#define ERR_INT_CAUSE 0x0080 -#define ERR_INT_SMI_DONE 0x00000010 -#define ERR_INT_MASK 0x0084 #define WINDOW_BASE(w) (0x0200 + ((w) << 3)) #define WINDOW_SIZE(w) (0x0204 + ((w) << 3)) #define WINDOW_REMAP_HIGH(w) (0x0280 + ((w) << 2)) @@ -265,25 +257,6 @@ struct mv643xx_eth_shared_private { */ void __iomem *base; - /* - * Points at the right SMI instance to use. - */ - struct mv643xx_eth_shared_private *smi; - - /* - * Provides access to local SMI interface. - */ - struct mii_bus *smi_bus; - - /* - * If we have access to the error interrupt pin (which is - * somewhat misnamed as it not only reflects internal errors - * but also reflects SMI completion), use that to wait for - * SMI access completion instead of polling the SMI busy bit. - */ - int err_interrupt; - wait_queue_head_t smi_busy_wait; - /* * Per-port MBUS window access register value. 
*/ @@ -1122,97 +1095,6 @@ out_write: wrlp(mp, PORT_SERIAL_CONTROL, pscr); } -static irqreturn_t mv643xx_eth_err_irq(int irq, void *dev_id) -{ - struct mv643xx_eth_shared_private *msp = dev_id; - - if (readl(msp->base + ERR_INT_CAUSE) & ERR_INT_SMI_DONE) { - writel(~ERR_INT_SMI_DONE, msp->base + ERR_INT_CAUSE); - wake_up(&msp->smi_busy_wait); - return IRQ_HANDLED; - } - - return IRQ_NONE; -} - -static int smi_is_done(struct mv643xx_eth_shared_private *msp) -{ - return !(readl(msp->base + SMI_REG) & SMI_BUSY); -} - -static int smi_wait_ready(struct mv643xx_eth_shared_private *msp) -{ - if (msp->err_interrupt == NO_IRQ) { - int i; - - for (i = 0; !smi_is_done(msp); i++) { - if (i == 10) - return -ETIMEDOUT; - msleep(10); - } - - return 0; - } - - if (!smi_is_done(msp)) { - wait_event_timeout(msp->smi_busy_wait, smi_is_done(msp), - msecs_to_jiffies(100)); - if (!smi_is_done(msp)) - return -ETIMEDOUT; - } - - return 0; -} - -static int smi_bus_read(struct mii_bus *bus, int addr, int reg) -{ - struct mv643xx_eth_shared_private *msp = bus->priv; - void __iomem *smi_reg = msp->base + SMI_REG; - int ret; - - if (smi_wait_ready(msp)) { - pr_warn("SMI bus busy timeout\n"); - return -ETIMEDOUT; - } - - writel(SMI_OPCODE_READ | (reg << 21) | (addr << 16), smi_reg); - - if (smi_wait_ready(msp)) { - pr_warn("SMI bus busy timeout\n"); - return -ETIMEDOUT; - } - - ret = readl(smi_reg); - if (!(ret & SMI_READ_VALID)) { - pr_warn("SMI bus read not valid\n"); - return -ENODEV; - } - - return ret & 0xffff; -} - -static int smi_bus_write(struct mii_bus *bus, int addr, int reg, u16 val) -{ - struct mv643xx_eth_shared_private *msp = bus->priv; - void __iomem *smi_reg = msp->base + SMI_REG; - - if (smi_wait_ready(msp)) { - pr_warn("SMI bus busy timeout\n"); - return -ETIMEDOUT; - } - - writel(SMI_OPCODE_WRITE | (reg << 21) | - (addr << 16) | (val & 0xffff), smi_reg); - - if (smi_wait_ready(msp)) { - pr_warn("SMI bus busy timeout\n"); - return -ETIMEDOUT; - } - - return 0; -} - - /* statistics ***************************************************************/ static struct net_device_stats *mv643xx_eth_get_stats(struct net_device *dev) { @@ -2687,47 +2569,6 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) if (msp->base == NULL) goto out_free; - /* - * Set up and register SMI bus. - */ - if (pd == NULL || pd->shared_smi == NULL) { - msp->smi_bus = mdiobus_alloc(); - if (msp->smi_bus == NULL) - goto out_unmap; - - msp->smi_bus->priv = msp; - msp->smi_bus->name = "mv643xx_eth smi"; - msp->smi_bus->read = smi_bus_read; - msp->smi_bus->write = smi_bus_write, - snprintf(msp->smi_bus->id, MII_BUS_ID_SIZE, "%s-%d", - pdev->name, pdev->id); - msp->smi_bus->parent = &pdev->dev; - msp->smi_bus->phy_mask = 0xffffffff; - if (mdiobus_register(msp->smi_bus) < 0) - goto out_free_mii_bus; - msp->smi = msp; - } else { - msp->smi = platform_get_drvdata(pd->shared_smi); - } - - msp->err_interrupt = NO_IRQ; - init_waitqueue_head(&msp->smi_busy_wait); - - /* - * Check whether the error interrupt is hooked up. - */ - res = platform_get_resource(pdev, IORESOURCE_IRQ, 0); - if (res != NULL) { - int err; - - err = request_irq(res->start, mv643xx_eth_err_irq, - IRQF_SHARED, "mv643xx_eth", msp); - if (!err) { - writel(ERR_INT_SMI_DONE, msp->base + ERR_INT_MASK); - msp->err_interrupt = res->start; - } - } - /* * (Re-)program MBUS remapping windows if we are asked to. 
*/ @@ -2743,10 +2584,6 @@ static int mv643xx_eth_shared_probe(struct platform_device *pdev) return 0; -out_free_mii_bus: - mdiobus_free(msp->smi_bus); -out_unmap: - iounmap(msp->base); out_free: kfree(msp); out: @@ -2756,14 +2593,7 @@ out: static int mv643xx_eth_shared_remove(struct platform_device *pdev) { struct mv643xx_eth_shared_private *msp = platform_get_drvdata(pdev); - struct mv643xx_eth_shared_platform_data *pd = pdev->dev.platform_data; - if (pd == NULL || pd->shared_smi == NULL) { - mdiobus_unregister(msp->smi_bus); - mdiobus_free(msp->smi_bus); - } - if (msp->err_interrupt != NO_IRQ) - free_irq(msp->err_interrupt, msp); iounmap(msp->base); kfree(msp); @@ -2826,14 +2656,21 @@ static void set_params(struct mv643xx_eth_private *mp, mp->txq_count = pd->tx_queue_count ? : 1; } +static void mv643xx_eth_adjust_link(struct net_device *dev) +{ + struct mv643xx_eth_private *mp = netdev_priv(dev); + + mv643xx_adjust_pscr(mp); +} + static struct phy_device *phy_scan(struct mv643xx_eth_private *mp, int phy_addr) { - struct mii_bus *bus = mp->shared->smi->smi_bus; struct phy_device *phydev; int start; int num; int i; + char phy_id[MII_BUS_ID_SIZE + 3]; if (phy_addr == MV643XX_ETH_PHY_ADDR_DEFAULT) { start = phy_addr_get(mp) & 0x1f; @@ -2843,17 +2680,19 @@ static struct phy_device *phy_scan(struct mv643xx_eth_private *mp, num = 1; } + /* Attempt to connect to the PHY using orion-mdio */ phydev = NULL; for (i = 0; i < num; i++) { int addr = (start + i) & 0x1f; - if (bus->phy_map[addr] == NULL) - mdiobus_scan(bus, addr); + snprintf(phy_id, sizeof(phy_id), PHY_ID_FMT, + "orion-mdio-mii", addr); - if (phydev == NULL) { - phydev = bus->phy_map[addr]; - if (phydev != NULL) - phy_addr_set(mp, addr); + phydev = phy_connect(mp->dev, phy_id, mv643xx_eth_adjust_link, + PHY_INTERFACE_MODE_GMII); + if (!IS_ERR(phydev)) { + phy_addr_set(mp, addr); + break; } } @@ -2866,8 +2705,6 @@ static void phy_init(struct mv643xx_eth_private *mp, int speed, int duplex) phy_reset(mp); - phy_attach(mp->dev, dev_name(&phy->dev), PHY_INTERFACE_MODE_GMII); - if (speed == 0) { phy->autoneg = AUTONEG_ENABLE; phy->speed = 0; diff --git a/include/linux/mv643xx_eth.h b/include/linux/mv643xx_eth.h index 49258e0ed1c6..141d395bbb5f 100644 --- a/include/linux/mv643xx_eth.h +++ b/include/linux/mv643xx_eth.h @@ -19,7 +19,6 @@ struct mv643xx_eth_shared_platform_data { struct mbus_dram_target_info *dram; - struct platform_device *shared_smi; /* * Max packet size for Tx IP/Layer 4 checksum, when set to 0, default * limit of 9KiB will be used. -- cgit From 51e7e8b632d8e564ba494dfa61358ac1a97e4ceb Mon Sep 17 00:00:00 2001 From: Bernie Thompson Date: Wed, 27 Feb 2013 12:19:17 -0800 Subject: mmc: core: Add in support to expose PRV for v4 MMCs The JEDEC MMC v4 spec defines a new PRV value in place of the original fwrev and hwrev specified in v1. We can expose this in the kernel to enable user space to more easily determine the product revision of a given MMC. Signed-off-by: Bernie Thompson Reviewed-by: Ulf Hansson Signed-off-by: Chris Ball --- Documentation/mmc/mmc-dev-attrs.txt | 1 + drivers/mmc/core/mmc.c | 3 +++ include/linux/mmc/card.h | 1 + 3 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/Documentation/mmc/mmc-dev-attrs.txt b/Documentation/mmc/mmc-dev-attrs.txt index 0d98fac8893b..189bab09255a 100644 --- a/Documentation/mmc/mmc-dev-attrs.txt +++ b/Documentation/mmc/mmc-dev-attrs.txt @@ -22,6 +22,7 @@ All attributes are read-only. 
manfid Manufacturer ID (from CID Register) name Product Name (from CID Register) oemid OEM/Application ID (from CID Register) + prv Product Revision (from CID Register) (SD and MMCv4 only) serial Product Serial Number (from CID Register) erase_size Erase group size preferred_erase_size Preferred erase size diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index c8f3d6e0684e..d584f7ca168c 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -96,6 +96,7 @@ static int mmc_decode_cid(struct mmc_card *card) card->cid.prod_name[3] = UNSTUFF_BITS(resp, 72, 8); card->cid.prod_name[4] = UNSTUFF_BITS(resp, 64, 8); card->cid.prod_name[5] = UNSTUFF_BITS(resp, 56, 8); + card->cid.prv = UNSTUFF_BITS(resp, 48, 8); card->cid.serial = UNSTUFF_BITS(resp, 16, 32); card->cid.month = UNSTUFF_BITS(resp, 12, 4); card->cid.year = UNSTUFF_BITS(resp, 8, 4) + 1997; @@ -627,6 +628,7 @@ MMC_DEV_ATTR(hwrev, "0x%x\n", card->cid.hwrev); MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid); MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name); MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid); +MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv); MMC_DEV_ATTR(serial, "0x%08x\n", card->cid.serial); MMC_DEV_ATTR(enhanced_area_offset, "%llu\n", card->ext_csd.enhanced_area_offset); @@ -645,6 +647,7 @@ static struct attribute *mmc_std_attrs[] = { &dev_attr_manfid.attr, &dev_attr_name.attr, &dev_attr_oemid.attr, + &dev_attr_prv.attr, &dev_attr_serial.attr, &dev_attr_enhanced_area_offset.attr, &dev_attr_enhanced_area_size.attr, diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 61b2c30c903b..f31725ba49f3 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -17,6 +17,7 @@ struct mmc_cid { unsigned int manfid; char prod_name[8]; + unsigned char prv; unsigned int serial; unsigned short oemid; unsigned short year; -- cgit From eed222aca8d077af3600b651176f6fd04d95cce1 Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Tue, 5 Mar 2013 11:24:52 +0800 Subject: mmc: sdio: bind acpi with sdio function device ACPI spec 5 defined the _ADR encoding for sdio bus as: High word - slot number (0 based) Low word - function number This patch adds support for binding the sdio function device with its acpi node and, if successful, involving acpi in its power management. Signed-off-by: Aaron Lu Reviewed-by: Adrian Hunter Signed-off-by: Chris Ball --- drivers/mmc/core/sdio_bus.c | 20 +++++++++++++++++++- drivers/mmc/host/sdhci-pci.c | 1 + include/linux/mmc/host.h | 2 ++ 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 5e57048e2c1d..8d6bb1821834 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -299,6 +300,19 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card) return func; } +#ifdef CONFIG_ACPI +static void sdio_acpi_set_handle(struct sdio_func *func) +{ + struct mmc_host *host = func->card->host; + u64 addr = (host->slotno << 16) | func->num; + + ACPI_HANDLE_SET(&func->dev, + acpi_get_child(ACPI_HANDLE(host->parent), addr)); +} +#else +static inline void sdio_acpi_set_handle(struct sdio_func *func) {} +#endif + /* * Register a new SDIO function with the driver model. 
*/ @@ -308,9 +322,12 @@ int sdio_add_func(struct sdio_func *func) dev_set_name(&func->dev, "%s:%d", mmc_card_id(func->card), func->num); + sdio_acpi_set_handle(func); ret = device_add(&func->dev); - if (ret == 0) + if (ret == 0) { sdio_func_set_present(func); + acpi_dev_pm_attach(&func->dev, false); + } return ret; } @@ -326,6 +343,7 @@ void sdio_remove_func(struct sdio_func *func) if (!sdio_func_present(func)) return; + acpi_dev_pm_detach(&func->dev, false); device_del(&func->dev); put_device(&func->dev); } diff --git a/drivers/mmc/host/sdhci-pci.c b/drivers/mmc/host/sdhci-pci.c index c7ccf3034dad..3dee22d098e9 100644 --- a/drivers/mmc/host/sdhci-pci.c +++ b/drivers/mmc/host/sdhci-pci.c @@ -1279,6 +1279,7 @@ static struct sdhci_pci_slot *sdhci_pci_probe_slot( } host->mmc->pm_caps = MMC_PM_KEEP_POWER | MMC_PM_WAKE_SDIO_IRQ; + host->mmc->slotno = slotno; ret = sdhci_add_host(host); if (ret) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index d6f20cc6415e..17d714801e94 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -361,6 +361,8 @@ struct mmc_host { unsigned int actual_clock; /* Actual HC clock rate */ + unsigned int slotno; /* used for sdio acpi binding */ + unsigned long private[0] ____cacheline_aligned; }; -- cgit From ce4f3313b05c836c21a91ac89f87dccf84ce9561 Mon Sep 17 00:00:00 2001 From: Peter De Schrijver Date: Fri, 22 Mar 2013 14:07:53 +0200 Subject: clk: add table lookup to mux Add a table lookup feature to the mux clock. Also allow arbitrary masks instead of the width. This will be used by some clocks on Tegra114. Also adapt the tegra periph clk because it uses struct clk_mux directly. Signed-off-by: Peter De Schrijver Tested-by: Stephen Warren Signed-off-by: Mike Turquette --- drivers/clk/clk-mux.c | 50 ++++++++++++++++++++++++++++++++++---------- drivers/clk/tegra/clk.h | 27 +++++++++++++++++------- include/linux/clk-private.h | 2 +- include/linux/clk-provider.h | 9 +++++++- 4 files changed, 67 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-mux.c b/drivers/clk/clk-mux.c index 508c032edce4..25b1734560d0 100644 --- a/drivers/clk/clk-mux.c +++ b/drivers/clk/clk-mux.c @@ -32,6 +32,7 @@ static u8 clk_mux_get_parent(struct clk_hw *hw) { struct clk_mux *mux = to_clk_mux(hw); + int num_parents = __clk_get_num_parents(hw->clk); u32 val; /* @@ -42,7 +43,16 @@ static u8 clk_mux_get_parent(struct clk_hw *hw) * val = 0x4 really means "bit 2, index starts at bit 0" */ val = readl(mux->reg) >> mux->shift; - val &= (1 << mux->width) - 1; + val &= mux->mask; + + if (mux->table) { + int i; + + for (i = 0; i < num_parents; i++) + if (mux->table[i] == val) + return i; + return -EINVAL; + } if (val && (mux->flags & CLK_MUX_INDEX_BIT)) val = ffs(val) - 1; @@ -50,7 +60,7 @@ static u8 clk_mux_get_parent(struct clk_hw *hw) if (val && (mux->flags & CLK_MUX_INDEX_ONE)) val--; - if (val >= __clk_get_num_parents(hw->clk)) + if (val >= num_parents) return -EINVAL; return val; @@ -62,17 +72,22 @@ static int clk_mux_set_parent(struct clk_hw *hw, u8 index) u32 val; unsigned long flags = 0; - if (mux->flags & CLK_MUX_INDEX_BIT) - index = (1 << ffs(index)); + if (mux->table) + index = mux->table[index]; - if (mux->flags & CLK_MUX_INDEX_ONE) - index++; + else { + if (mux->flags & CLK_MUX_INDEX_BIT) + index = (1 << ffs(index)); + + if (mux->flags & CLK_MUX_INDEX_ONE) + index++; + } if (mux->lock) spin_lock_irqsave(mux->lock, flags); val = readl(mux->reg); - val &= ~(((1 << mux->width) - 1) << mux->shift); + val &= ~(mux->mask << 
mux->shift); val |= index << mux->shift; writel(val, mux->reg); @@ -88,10 +103,10 @@ const struct clk_ops clk_mux_ops = { }; EXPORT_SYMBOL_GPL(clk_mux_ops); -struct clk *clk_register_mux(struct device *dev, const char *name, +struct clk *clk_register_mux_table(struct device *dev, const char *name, const char **parent_names, u8 num_parents, unsigned long flags, - void __iomem *reg, u8 shift, u8 width, - u8 clk_mux_flags, spinlock_t *lock) + void __iomem *reg, u8 shift, u32 mask, + u8 clk_mux_flags, u32 *table, spinlock_t *lock) { struct clk_mux *mux; struct clk *clk; @@ -113,9 +128,10 @@ struct clk *clk_register_mux(struct device *dev, const char *name, /* struct clk_mux assignments */ mux->reg = reg; mux->shift = shift; - mux->width = width; + mux->mask = mask; mux->flags = clk_mux_flags; mux->lock = lock; + mux->table = table; mux->hw.init = &init; clk = clk_register(dev, &mux->hw); @@ -125,3 +141,15 @@ struct clk *clk_register_mux(struct device *dev, const char *name, return clk; } + +struct clk *clk_register_mux(struct device *dev, const char *name, + const char **parent_names, u8 num_parents, unsigned long flags, + void __iomem *reg, u8 shift, u8 width, + u8 clk_mux_flags, spinlock_t *lock) +{ + u32 mask = BIT(width) - 1; + + return clk_register_mux_table(dev, name, parent_names, num_parents, + flags, reg, shift, mask, clk_mux_flags, + NULL, lock); +} diff --git a/drivers/clk/tegra/clk.h b/drivers/clk/tegra/clk.h index 0744731c6229..a09d7dcaf183 100644 --- a/drivers/clk/tegra/clk.h +++ b/drivers/clk/tegra/clk.h @@ -355,15 +355,16 @@ struct clk *tegra_clk_register_periph_nodiv(const char *name, struct tegra_clk_periph *periph, void __iomem *clk_base, u32 offset); -#define TEGRA_CLK_PERIPH(_mux_shift, _mux_width, _mux_flags, \ +#define TEGRA_CLK_PERIPH(_mux_shift, _mux_mask, _mux_flags, \ _div_shift, _div_width, _div_frac_width, \ _div_flags, _clk_num, _enb_refcnt, _regs, \ - _gate_flags) \ + _gate_flags, _table) \ { \ .mux = { \ .flags = _mux_flags, \ .shift = _mux_shift, \ - .width = _mux_width, \ + .mask = _mux_mask, \ + .table = _table, \ }, \ .divider = { \ .flags = _div_flags, \ @@ -393,26 +394,36 @@ struct tegra_periph_init_data { const char *dev_id; }; -#define TEGRA_INIT_DATA(_name, _con_id, _dev_id, _parent_names, _offset, \ - _mux_shift, _mux_width, _mux_flags, _div_shift, \ +#define TEGRA_INIT_DATA_TABLE(_name, _con_id, _dev_id, _parent_names, _offset,\ + _mux_shift, _mux_mask, _mux_flags, _div_shift, \ _div_width, _div_frac_width, _div_flags, _regs, \ - _clk_num, _enb_refcnt, _gate_flags, _clk_id) \ + _clk_num, _enb_refcnt, _gate_flags, _clk_id, _table) \ { \ .name = _name, \ .clk_id = _clk_id, \ .parent_names = _parent_names, \ .num_parents = ARRAY_SIZE(_parent_names), \ - .periph = TEGRA_CLK_PERIPH(_mux_shift, _mux_width, \ + .periph = TEGRA_CLK_PERIPH(_mux_shift, _mux_mask, \ _mux_flags, _div_shift, \ _div_width, _div_frac_width, \ _div_flags, _clk_num, \ _enb_refcnt, _regs, \ - _gate_flags), \ + _gate_flags, _table), \ .offset = _offset, \ .con_id = _con_id, \ .dev_id = _dev_id, \ } +#define TEGRA_INIT_DATA(_name, _con_id, _dev_id, _parent_names, _offset,\ + _mux_shift, _mux_width, _mux_flags, _div_shift, \ + _div_width, _div_frac_width, _div_flags, _regs, \ + _clk_num, _enb_refcnt, _gate_flags, _clk_id) \ + TEGRA_INIT_DATA_TABLE(_name, _con_id, _dev_id, _parent_names, _offset,\ + _mux_shift, BIT(_mux_width) - 1, _mux_flags, \ + _div_shift, _div_width, _div_frac_width, _div_flags, \ + _regs, _clk_num, _enb_refcnt, _gate_flags, _clk_id,\ + NULL) + /** * struct 
clk_super_mux - super clock * diff --git a/include/linux/clk-private.h b/include/linux/clk-private.h index 9c7f5807824b..dd7adff76e81 100644 --- a/include/linux/clk-private.h +++ b/include/linux/clk-private.h @@ -152,7 +152,7 @@ struct clk { }, \ .reg = _reg, \ .shift = _shift, \ - .width = _width, \ + .mask = BIT(_width) - 1, \ .flags = _mux_flags, \ .lock = _lock, \ }; \ diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 56e6cc12c796..63ba3b740794 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -297,8 +297,9 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, struct clk_mux { struct clk_hw hw; void __iomem *reg; + u32 *table; + u32 mask; u8 shift; - u8 width; u8 flags; spinlock_t *lock; }; @@ -307,11 +308,17 @@ struct clk_mux { #define CLK_MUX_INDEX_BIT BIT(1) extern const struct clk_ops clk_mux_ops; + struct clk *clk_register_mux(struct device *dev, const char *name, const char **parent_names, u8 num_parents, unsigned long flags, void __iomem *reg, u8 shift, u8 width, u8 clk_mux_flags, spinlock_t *lock); +struct clk *clk_register_mux_table(struct device *dev, const char *name, + const char **parent_names, u8 num_parents, unsigned long flags, + void __iomem *reg, u8 shift, u32 mask, + u8 clk_mux_flags, u32 *table, spinlock_t *lock); + /** * struct clk_fixed_factor - fixed multiplier and divider clock * -- cgit From cc244ddae6d4c6902ac9d7d64023534f8c44a7eb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 May 2012 12:30:07 -0700 Subject: timekeeping: Move TAI management into timekeeping core from ntp Currently NTP manages the TAI offset. Since there are plans for a CLOCK_TAI clockid, push the TAI management into the timekeeping core. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- include/linux/time.h | 2 ++ include/linux/timekeeper_internal.h | 3 +++ kernel/time/ntp.c | 18 ++++++++------- kernel/time/timekeeping.c | 44 +++++++++++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index d4835dfdf25e..47210a175e78 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -181,6 +181,8 @@ extern struct timespec timespec_trunc(struct timespec t, unsigned gran); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); +extern s32 timekeeping_get_tai_offset(void); +extern void timekeeping_set_tai_offset(s32 tai_offset); struct tms; extern void do_sys_times(struct tms *); diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index e1d558e237ec..ff94f436f8b7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -62,6 +62,9 @@ struct timekeeper { ktime_t offs_boot; /* The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
*/ struct timespec raw_time; + /* The current UTC to TAI offset in seconds */ + s32 tai_offset; + /* Seqlock for all timekeeper values */ seqlock_t lock; }; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 072bb066bb7d..59e2749be0fa 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -53,9 +53,6 @@ static int time_state = TIME_OK; /* clock status bits: */ static int time_status = STA_UNSYNC; -/* TAI offset (secs): */ -static long time_tai; - /* time adjustment (nsecs): */ static s64 time_offset; @@ -415,7 +412,6 @@ int second_overflow(unsigned long secs) else if (secs % 86400 == 0) { leap = -1; time_state = TIME_OOP; - time_tai++; printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n"); } @@ -425,7 +421,6 @@ int second_overflow(unsigned long secs) time_state = TIME_OK; else if ((secs + 1) % 86400 == 0) { leap = 1; - time_tai--; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); @@ -579,7 +574,9 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts) * Called with ntp_lock held, so we can access and modify * all the global NTP state: */ -static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts) +static inline void process_adjtimex_modes(struct timex *txc, + struct timespec *ts, + s32 *time_tai) { if (txc->modes & ADJ_STATUS) process_adj_status(txc, ts); @@ -613,7 +610,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts } if (txc->modes & ADJ_TAI && txc->constant > 0) - time_tai = txc->constant; + *time_tai = txc->constant; if (txc->modes & ADJ_OFFSET) ntp_update_offset(txc->offset); @@ -632,6 +629,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts int do_adjtimex(struct timex *txc) { struct timespec ts; + u32 time_tai, orig_tai; int result; /* Validate the data before disabling interrupts */ @@ -671,6 +669,7 @@ int do_adjtimex(struct timex *txc) } getnstimeofday(&ts); + orig_tai = time_tai = timekeeping_get_tai_offset(); raw_spin_lock_irq(&ntp_lock); @@ -687,7 +686,7 @@ int do_adjtimex(struct timex *txc) /* If there are input parameters, then process them: */ if (txc->modes) - process_adjtimex_modes(txc, &ts); + process_adjtimex_modes(txc, &ts, &time_tai); txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, NTP_SCALE_SHIFT); @@ -716,6 +715,9 @@ int do_adjtimex(struct timex *txc) raw_spin_unlock_irq(&ntp_lock); + if (time_tai != orig_tai) + timekeeping_set_tai_offset(time_tai); + txc->time.tv_sec = ts.tv_sec; txc->time.tv_usec = ts.tv_nsec; if (!(time_status & STA_NANO)) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0355f125d585..937098aab498 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -513,6 +513,48 @@ error: /* even if we error out, we forwarded the time, so call update */ } EXPORT_SYMBOL(timekeeping_inject_offset); + +/** + * timekeeping_get_tai_offset - Returns current TAI offset from UTC + * + */ +s32 timekeeping_get_tai_offset(void) +{ + struct timekeeper *tk = &timekeeper; + unsigned int seq; + s32 ret; + + do { + seq = read_seqbegin(&tk->lock); + ret = tk->tai_offset; + } while (read_seqretry(&tk->lock, seq)); + + return ret; +} + +/** + * __timekeeping_set_tai_offset - Lock free worker function + * + */ +void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) +{ + tk->tai_offset = tai_offset; +} + +/** + * timekeeping_set_tai_offset - Sets the current TAI offset from UTC + * + */ +void timekeeping_set_tai_offset(s32 
tai_offset) +{ + struct timekeeper *tk = &timekeeper; + unsigned long flags; + + write_seqlock_irqsave(&tk->lock, flags); + __timekeeping_set_tai_offset(tk, tai_offset); + write_sequnlock_irqrestore(&tk->lock, flags); +} + /** * change_clocksource - Swaps clocksources if a new one is available * @@ -1143,6 +1185,8 @@ static inline void accumulate_nsecs_to_secs(struct timekeeper *tk) tk_set_wall_to_mono(tk, timespec_sub(tk->wall_to_monotonic, ts)); + __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); + clock_was_set_delayed(); } } -- cgit From 1ff3c9677bff7e468e0c487d0ffefe4e901d33f4 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 3 May 2012 12:43:40 -0700 Subject: timekeeping: Add CLOCK_TAI clockid This adds a CLOCK_TAI clockid and the needed accessors. CC: Thomas Gleixner CC: Eric Dumazet CC: Richard Cochran Signed-off-by: John Stultz --- include/linux/time.h | 1 + include/uapi/linux/time.h | 6 ++---- kernel/posix-timers.c | 10 ++++++++++ kernel/time/timekeeping.c | 30 ++++++++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/time.h b/include/linux/time.h index 47210a175e78..22d81b3c955b 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -183,6 +183,7 @@ extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); +extern void timekeeping_clocktai(struct timespec *ts); struct tms; extern void do_sys_times(struct tms *); diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 0d3c0edc3eda..e75e1b6ff27f 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -54,11 +54,9 @@ struct itimerval { #define CLOCK_BOOTTIME 7 #define CLOCK_REALTIME_ALARM 8 #define CLOCK_BOOTTIME_ALARM 9 +#define CLOCK_SGI_CYCLE 10 /* Hardware specific */ +#define CLOCK_TAI 11 -/* - * The IDs of various hardware clocks: - */ -#define CLOCK_SGI_CYCLE 10 #define MAX_CLOCKS 16 #define CLOCKS_MASK (CLOCK_REALTIME | CLOCK_MONOTONIC) #define CLOCKS_MONO CLOCK_MONOTONIC diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 6edbb2c55c22..fbfc5f1b7710 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -221,6 +221,11 @@ static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +{ + timekeeping_clocktai(tp); + return 0; +} /* * Initialize everything, well, just everything in Posix clocks/timers ;) @@ -261,6 +266,10 @@ static __init int init_posix_timers(void) .clock_getres = posix_get_coarse_res, .clock_get = posix_get_monotonic_coarse, }; + struct k_clock clock_tai = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_tai, + }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_boottime, @@ -278,6 +287,7 @@ static __init int init_posix_timers(void) posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + posix_timers_register_clock(CLOCK_TAI, &clock_tai); posix_timers_cache = kmem_cache_create("posix_timers_cache", sizeof (struct k_itimer), 0, SLAB_PANIC, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 937098aab498..8a842756572d 100644 --- a/kernel/time/timekeeping.c +++
b/kernel/time/timekeeping.c @@ -379,6 +379,36 @@ void ktime_get_ts(struct timespec *ts) } EXPORT_SYMBOL_GPL(ktime_get_ts); + +/** + * timekeeping_clocktai - Returns the TAI time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. + */ +void timekeeping_clocktai(struct timespec *ts) +{ + struct timekeeper *tk = &timekeeper; + unsigned long seq; + u64 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqbegin(&tk->lock); + + ts->tv_sec = tk->xtime_sec + tk->tai_offset; + nsecs = timekeeping_get_ns(tk); + + } while (read_seqretry(&tk->lock, seq)); + + ts->tv_nsec = 0; + timespec_add_ns(ts, nsecs); + +} +EXPORT_SYMBOL(timekeeping_clocktai); + + #ifdef CONFIG_NTP_PPS /** -- cgit From 90adda98b89aaf68b06014ecf805b6c477daa19b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 21 Jan 2013 17:00:11 -0800 Subject: hrtimer: Add hrtimer support for CLOCK_TAI Add hrtimer support for CLOCK_TAI, as well as posix timer interfaces. Signed-off-by: John Stultz --- include/linux/hrtimer.h | 5 ++++- include/linux/timekeeper_internal.h | 2 ++ kernel/hrtimer.c | 14 +++++++++++++- kernel/posix-timers.c | 6 ++++++ kernel/time/timekeeping.c | 20 +++++++++++++++++++- 5 files changed, 44 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index cc07d2777bbe..d19a5c2d2270 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -157,6 +157,7 @@ enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, + HRTIMER_BASE_TAI, HRTIMER_MAX_CLOCK_BASES, }; @@ -327,7 +328,9 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_real(void); extern ktime_t ktime_get_boottime(void); extern ktime_t ktime_get_monotonic_offset(void); -extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot); +extern ktime_t ktime_get_clocktai(void); +extern ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index ff94f436f8b7..26700d870506 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -64,6 +64,8 @@ struct timekeeper { struct timespec raw_time; /* The current UTC to TAI offset in seconds */ s32 tai_offset; + /* Offset clock monotonic -> clock tai */ + ktime_t offs_tai; /* Seqlock for all timekeeper values */ seqlock_t lock; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..258720741d3e 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -83,6 +83,12 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .get_time = &ktime_get_boottime, .resolution = KTIME_LOW_RES, }, + { + .index = HRTIMER_BASE_TAI, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + .resolution = KTIME_LOW_RES, + }, } }; @@ -90,6 +96,7 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, + [CLOCK_TAI] = HRTIMER_BASE_TAI, }; static inline int hrtimer_clockid_to_base(clockid_t clock_id) @@ -106,8 +113,10 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) { ktime_t xtim, mono, boot; struct timespec xts, tom, slp; + s32 tai_offset; get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + tai_offset = timekeeping_get_tai_offset(); xtim = 
timespec_to_ktime(xts); mono = ktime_add(xtim, timespec_to_ktime(tom)); @@ -115,6 +124,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; + base->clock_base[HRTIMER_BASE_TAI].softirq_time = + ktime_add(xtim, ktime_set(tai_offset, 0)); } /* @@ -651,8 +662,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets(offs_real, offs_boot); + return ktime_get_update_offsets(offs_real, offs_boot, offs_tai); } /* diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index fbfc5f1b7710..2a2e173d0a7a 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -269,6 +269,12 @@ static __init int init_posix_timers(void) struct k_clock clock_tai = { .clock_getres = hrtimer_get_res, .clock_get = posix_get_tai, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, }; struct k_clock clock_boottime = { .clock_getres = hrtimer_get_res, diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 8a842756572d..8061ae0be7bd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -67,6 +67,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -409,6 +410,20 @@ void timekeeping_clocktai(struct timespec *ts) EXPORT_SYMBOL(timekeeping_clocktai); +/** + * ktime_get_clocktai - Returns the TAI time of day in a ktime + * + * Returns the time of day in a ktime. 
+ */ +ktime_t ktime_get_clocktai(void) +{ + struct timespec ts; + + timekeeping_clocktai(&ts); + return timespec_to_ktime(ts); +} +EXPORT_SYMBOL(ktime_get_clocktai); + #ifdef CONFIG_NTP_PPS /** @@ -569,6 +584,7 @@ s32 timekeeping_get_tai_offset(void) void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; + tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); } /** @@ -1539,7 +1555,8 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, * Returns current monotonic time and updates the offsets * Called from hrtimer_interupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) +ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, + ktime_t *offs_tai) { struct timekeeper *tk = &timekeeper; ktime_t now; @@ -1554,6 +1571,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot) *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; } while (read_seqretry(&tk->lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); -- cgit From eb93e4d93093615c60cb7dd3dcb24e46bd7d62d4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:36 +0000 Subject: timekeeping: Make jiffies_lock internal Nothing outside of the timekeeping core needs that lock. Signed-off-by: Thomas Gleixner Signed-off-by: John Stultz --- include/linux/jiffies.h | 1 - kernel/time/tick-internal.h | 2 ++ kernel/time/timekeeping.c | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 82ed068b1ebe..8fb8edf12417 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -75,7 +75,6 @@ extern int register_refined_jiffies(long clock_tick_rate); */ extern u64 __jiffy_data jiffies_64; extern unsigned long volatile __jiffy_data jiffies; -extern seqlock_t jiffies_lock; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cf3e59ed6dc0..f5c9207967cf 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -4,6 +4,8 @@ #include #include +extern seqlock_t jiffies_lock; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #define TICK_DO_TIMER_NONE -1 diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c442a4ccccc9..b0c648fc959f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -23,6 +23,7 @@ #include #include +#include "tick-internal.h" static struct timekeeper timekeeper; -- cgit From 7e40672d930b369c1984457233ec5557aa53bfb8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Feb 2013 22:51:37 +0000 Subject: timekeeping: Move lock out of timekeeper struct Make the lock a separate entity. Preparatory patch for shadow timekeeper structure. 
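(Illustrative aside, not part of the patch: with the seqlock moved to file scope, readers and writers follow the usual seqlock pattern against the global lock instead of a per-timekeeper member. The names below are made up for the example.)

#include <linux/seqlock.h>

/* Hypothetical sketch of the access pattern with a file-static seqlock. */
static DEFINE_SEQLOCK(example_lock);
static u64 example_value;

static u64 example_read(void)
{
	unsigned long seq;
	u64 v;

	do {
		seq = read_seqbegin(&example_lock);	/* open read section */
		v = example_value;			/* consistent snapshot */
	} while (read_seqretry(&example_lock, seq));	/* retry if a writer ran */

	return v;
}

static void example_write(u64 new_value)
{
	unsigned long flags;

	write_seqlock_irqsave(&example_lock, flags);
	example_value = new_value;
	write_sequnlock_irqrestore(&example_lock, flags);
}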
Signed-off-by: Thomas Gleixner [Merged with CLOCK_TAI changes] Signed-off-by: John Stultz --- include/linux/timekeeper_internal.h | 2 - kernel/time/timekeeping.c | 108 ++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 26700d870506..a151bd70e52b 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -67,8 +67,6 @@ struct timekeeper { /* Offset clock monotonic -> clock tai */ ktime_t offs_tai; - /* Seqlock for all timekeeper values */ - seqlock_t lock; }; static inline struct timespec tk_xtime(struct timekeeper *tk) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b0c648fc959f..caede71c0a35 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -26,6 +26,7 @@ #include "tick-internal.h" static struct timekeeper timekeeper; +static DEFINE_SEQLOCK(timekeeper_lock); /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -212,11 +213,11 @@ int pvclock_gtod_register_notifier(struct notifier_block *nb) unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_register(&pvclock_gtod_chain, nb); /* update timekeeping data */ update_pvclock_gtod(tk); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -230,13 +231,12 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { - struct timekeeper *tk = &timekeeper; unsigned long flags; int ret; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); ret = raw_notifier_chain_unregister(&pvclock_gtod_chain, nb); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); return ret; } @@ -296,12 +296,12 @@ int __getnstimeofday(struct timespec *ts) s64 nsecs = 0; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -337,11 +337,11 @@ ktime_t ktime_get(void) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; nsecs = timekeeping_get_ns(tk) + tk->wall_to_monotonic.tv_nsec; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); /* * Use ktime_set/ktime_add_ns to create a proper ktime on * 32-bit architectures without CONFIG_KTIME_SCALAR. 
@@ -368,12 +368,12 @@ void ktime_get_ts(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec; ts->tv_nsec = 0; @@ -397,12 +397,12 @@ void timekeeping_clocktai(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec + tk->tai_offset; nsecs = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_nsec = 0; timespec_add_ns(ts, nsecs); @@ -445,7 +445,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) WARN_ON_ONCE(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *ts_raw = tk->raw_time; ts_real->tv_sec = tk->xtime_sec; @@ -454,7 +454,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) nsecs_raw = timekeeping_get_ns_raw(tk); nsecs_real = timekeeping_get_ns(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts_raw, nsecs_raw); timespec_add_ns(ts_real, nsecs_real); @@ -494,7 +494,7 @@ int do_settimeofday(const struct timespec *tv) if (!timespec_valid_strict(tv)) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -508,7 +508,7 @@ int do_settimeofday(const struct timespec *tv) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -533,7 +533,7 @@ int timekeeping_inject_offset(struct timespec *ts) if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) return -EINVAL; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -550,7 +550,7 @@ int timekeeping_inject_offset(struct timespec *ts) error: /* even if we error out, we forwarded the time, so call update */ timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -571,9 +571,9 @@ s32 timekeeping_get_tai_offset(void) s32 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->tai_offset; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -597,9 +597,9 @@ void timekeeping_set_tai_offset(s32 tai_offset) struct timekeeper *tk = &timekeeper; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); __timekeeping_set_tai_offset(tk, tai_offset); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /** @@ -615,7 +615,7 @@ static int change_clocksource(void *data) new = (struct clocksource *) data; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); if (!new->enable || new->enable(new) == 0) { @@ -626,7 +626,7 @@ static int change_clocksource(void *data) } timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); 
+ write_sequnlock_irqrestore(&timekeeper_lock, flags); return 0; } @@ -676,11 +676,11 @@ void getrawmonotonic(struct timespec *ts) s64 nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); nsecs = timekeeping_get_ns_raw(tk); *ts = tk->raw_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); timespec_add_ns(ts, nsecs); } @@ -696,11 +696,11 @@ int timekeeping_valid_for_hres(void) int ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -715,11 +715,11 @@ u64 timekeeping_max_deferment(void) u64 ret; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ret = tk->clock->max_idle_ns; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return ret; } @@ -782,11 +782,9 @@ void __init timekeeping_init(void) boot.tv_nsec = 0; } - seqlock_init(&tk->lock); - ntp_init(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); clock = clocksource_default_clock(); if (clock->enable) clock->enable(clock); @@ -805,7 +803,7 @@ void __init timekeeping_init(void) tmp.tv_nsec = 0; tk_set_sleep_time(tk, tmp); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); } /* time in seconds when suspend began */ @@ -853,7 +851,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) if (has_persistent_clock()) return; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); @@ -861,7 +859,7 @@ void timekeeping_inject_sleeptime(struct timespec *delta) timekeeping_update(tk, true); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); /* signal hrtimers about time change */ clock_was_set(); @@ -888,7 +886,7 @@ static void timekeeping_resume(void) clockevents_resume(); clocksource_resume(); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* * After system resumes, we need to calculate the suspended time and @@ -940,7 +938,7 @@ static void timekeeping_resume(void) tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, false); - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); touch_softlockup_watchdog(); @@ -959,7 +957,7 @@ static int timekeeping_suspend(void) read_persistent_clock(&timekeeping_suspend_time); - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); timekeeping_forward_now(tk); timekeeping_suspended = 1; @@ -982,7 +980,7 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } - write_sequnlock_irqrestore(&tk->lock, flags); + write_sequnlock_irqrestore(&timekeeper_lock, flags); clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); clocksource_suspend(); @@ -1322,7 +1320,7 @@ static void update_wall_time(void) int shift = 0, maxshift; unsigned long flags; - write_seqlock_irqsave(&tk->lock, flags); + write_seqlock_irqsave(&timekeeper_lock, flags); /* Make sure we're fully resumed: */ if (unlikely(timekeeping_suspended)) @@ -1377,7 +1375,7 @@ static void update_wall_time(void) timekeeping_update(tk, false); out: - write_sequnlock_irqrestore(&tk->lock, flags); + 
write_sequnlock_irqrestore(&timekeeper_lock, flags); } @@ -1425,13 +1423,13 @@ void get_monotonic_boottime(struct timespec *ts) WARN_ON(timekeeping_suspended); do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); ts->tv_sec = tk->xtime_sec; nsec = timekeeping_get_ns(tk); tomono = tk->wall_to_monotonic; sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); ts->tv_sec += tomono.tv_sec + sleep.tv_sec; ts->tv_nsec = 0; @@ -1490,10 +1488,10 @@ struct timespec current_kernel_time(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return now; } @@ -1506,11 +1504,11 @@ struct timespec get_monotonic_coarse(void) unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); now = tk_xtime(tk); mono = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); set_normalized_timespec(&now, now.tv_sec + mono.tv_sec, now.tv_nsec + mono.tv_nsec); @@ -1541,11 +1539,11 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, unsigned long seq; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); *xtim = tk_xtime(tk); *wtom = tk->wall_to_monotonic; *sleep = tk->total_sleep_time; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); } #ifdef CONFIG_HIGH_RES_TIMERS @@ -1566,7 +1564,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, u64 secs, nsecs; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); secs = tk->xtime_sec; nsecs = timekeeping_get_ns(tk); @@ -1574,7 +1572,7 @@ ktime_t ktime_get_update_offsets(ktime_t *offs_real, ktime_t *offs_boot, *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; *offs_tai = tk->offs_tai; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); now = ktime_add_ns(ktime_set(secs, 0), nsecs); now = ktime_sub(now, *offs_real); @@ -1592,9 +1590,9 @@ ktime_t ktime_get_monotonic_offset(void) struct timespec wtom; do { - seq = read_seqbegin(&tk->lock); + seq = read_seqbegin(&timekeeper_lock); wtom = tk->wall_to_monotonic; - } while (read_seqretry(&tk->lock, seq)); + } while (read_seqretry(&timekeeper_lock, seq)); return timespec_to_ktime(wtom); } -- cgit From cbe5e6109538ddab57764a88d9f0c2accd0c7d48 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 22 Mar 2013 22:17:36 -0600 Subject: lru_cache: introduce lc_get_cumulative() New helper to be able to consolidate more updates into a single transaction. Without this, we can only grab a single refcount on an updated element while preparing a transaction. lc_get_cumulative - like lc_get; also finds to-be-changed elements @lc: the lru cache to operate on @enr: the label to look up Unlike lc_get this also returns the element for @enr, if it belongs to a pending transaction, so the return values are like for lc_get(), plus: pointer to an element already on the "to_be_changed" list. In this case, the cache was already marked %LC_DIRTY. Caller needs to make sure that the pending transaction is completed, before proceeding to actually use this element. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Fixed up by Jens to export lc_get_cumulative(). 
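(Illustrative aside, not part of the patch: a caller preparing such a cumulative transaction could collect its references along these lines. The function and label array are made-up placeholders, and the cache's external locking and the actual commit step are omitted; only lc_get_cumulative() and its semantics come from the commit.)

#include <linux/lru_cache.h>

/* Hypothetical usage: take refcounts on several labels, including ones
 * already queued on the "to_be_changed" list, then commit them all in
 * a single transaction. */
static void example_prepare(struct lru_cache *lc, unsigned int *labels, int n)
{
	int i;

	for (i = 0; i < n; i++) {
		struct lc_element *e = lc_get_cumulative(lc, labels[i]);

		if (!e)
			continue;	/* no free slot and not pending */
		/* e may still belong to the pending transaction; wait for
		 * that transaction to complete before using the element. */
	}
	/* ...assemble and commit the combined transaction here... */
}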
Signed-off-by: Jens Axboe --- include/linux/lru_cache.h | 1 + lib/lru_cache.c | 56 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 47 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 4019013c6593..46262284de47 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -256,6 +256,7 @@ extern void lc_destroy(struct lru_cache *lc); extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); extern void lc_del(struct lru_cache *lc, struct lc_element *element); +extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 8335d39d2ccd..4a83ecd03650 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -365,7 +365,13 @@ static int lc_unused_element_available(struct lru_cache *lc) return 0; } -static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change) +/* used as internal flags to __lc_get */ +enum { + LC_GET_MAY_CHANGE = 1, + LC_GET_MAY_USE_UNCOMMITTED = 2, +}; + +static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsigned int flags) { struct lc_element *e; @@ -380,22 +386,31 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool * this enr is currently being pulled in already, * and will be available once the pending transaction * has been committed. */ - if (e && e->lc_new_number == e->lc_number) { + if (e) { + if (e->lc_new_number != e->lc_number) { + /* It has been found above, but on the "to_be_changed" + * list, not yet committed. Don't pull it in twice, + * wait for the transaction, then try again... + */ + if (!(flags & LC_GET_MAY_USE_UNCOMMITTED)) + RETURN(NULL); + /* ... unless the caller is aware of the implications, + * probably preparing a cumulative transaction. */ + ++e->refcnt; + ++lc->hits; + RETURN(e); + } + /* else: lc_new_number == lc_number; a real hit. */ ++lc->hits; if (e->refcnt++ == 0) lc->used++; list_move(&e->list, &lc->in_use); /* Not evictable... */ RETURN(e); } + /* e == NULL */ ++lc->misses; - if (!may_change) - RETURN(NULL); - - /* It has been found above, but on the "to_be_changed" list, not yet - * committed. Don't pull it in twice, wait for the transaction, then - * try again */ - if (e) + if (!(flags & LC_GET_MAY_CHANGE)) RETURN(NULL); /* To avoid races with lc_try_lock(), first, mark us dirty @@ -477,7 +492,27 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool */ struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr) { - return __lc_get(lc, enr, 1); + return __lc_get(lc, enr, LC_GET_MAY_CHANGE); +} + +/** + * lc_get_cumulative - like lc_get; also finds to-be-changed elements + * @lc: the lru cache to operate on + * @enr: the label to look up + * + * Unlike lc_get this also returns the element for @enr, if it is belonging to + * a pending transaction, so the return values are like for lc_get(), + * plus: + * + * pointer to an element already on the "to_be_changed" list. + * In this case, the cache was already marked %LC_DIRTY. + * + * Caller needs to make sure that the pending transaction is completed, + * before proceeding to actually use this element. 
+ */ +struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr) +{ + return __lc_get(lc, enr, LC_GET_MAY_CHANGE|LC_GET_MAY_USE_UNCOMMITTED); } /** @@ -648,3 +683,4 @@ EXPORT_SYMBOL(lc_seq_printf_stats); EXPORT_SYMBOL(lc_seq_dump_details); EXPORT_SYMBOL(lc_try_lock); EXPORT_SYMBOL(lc_is_used); +EXPORT_SYMBOL(lc_get_cumulative); -- cgit From 5bbcf5e6abe97485748b51ea0713cc3012b4a8f0 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Tue, 19 Mar 2013 18:16:59 +0100 Subject: drbd: adjust upper limit for activity log extents Now that the on-disk activity-log ring buffer size is adjustable, the maximum active set can become larger, and is now limited by the use of 16bit "labels". This increases the maximum working set from 6433 to 65534 extents, each of which covers an area of 4MiB. Which means that if you use the maximum, you'd have to resync more than 250 GiB after an unclean Primary shutdown. With capable backend storage and replication links, this is entirely feasible. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 47 +++++++++++++++++++++++++++++++++++--------- include/linux/drbd_limits.h | 11 +++++------ 2 files changed, 43 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index bcf900bcd142..42fda4ae2f87 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1141,15 +1141,32 @@ static bool should_set_defaults(struct genl_info *info) return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS); } -static void enforce_disk_conf_limits(struct disk_conf *dc) +static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev) { - if (dc->al_extents < DRBD_AL_EXTENTS_MIN) - dc->al_extents = DRBD_AL_EXTENTS_MIN; - if (dc->al_extents > DRBD_AL_EXTENTS_MAX) - dc->al_extents = DRBD_AL_EXTENTS_MAX; + /* This is limited by 16 bit "slot" numbers, + * and by available on-disk context storage. + * + * Also (u16)~0 is special (denotes a "free" extent). + * + * One transaction occupies one 4kB on-disk block, + * we have n such blocks in the on disk ring buffer, + * the "current" transaction may fail (n-1), + * and there is 919 slot numbers context information per transaction. + * + * 72 transaction blocks amounts to more than 2**16 context slots, + * so cap there first. 
+	 */
+	const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+	const unsigned int sufficient_on_disk =
+		(max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+		/AL_CONTEXT_PER_TRANSACTION;
 
-	if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
-		dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+	unsigned int al_size_4k = bdev->md.al_size_4k;
+
+	if (al_size_4k > sufficient_on_disk)
+		return max_al_nr;
+
+	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
 }
 
 int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
@@ -1196,7 +1213,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
 	if (!expect(new_disk_conf->resync_rate >= 1))
 		new_disk_conf->resync_rate = 1;
 
-	enforce_disk_conf_limits(new_disk_conf);
+	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+	if (new_disk_conf->al_extents > drbd_al_extents_max(mdev->ldev))
+		new_disk_conf->al_extents = drbd_al_extents_max(mdev->ldev);
+
+	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
 
 	fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
 	if (fifo_size != mdev->rs_plan_s->size) {
@@ -1344,7 +1367,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 		goto fail;
 	}
 
-	enforce_disk_conf_limits(new_disk_conf);
+	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
 
 	new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
 	if (!new_plan) {
@@ -1419,6 +1443,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto fail;
 
+	if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+		new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+	if (new_disk_conf->al_extents > drbd_al_extents_max(nbc))
+		new_disk_conf->al_extents = drbd_al_extents_max(nbc);
+
 	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
 		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
 			(unsigned long long) drbd_get_max_capacity(nbc),
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index 1fa19c5f5e64..1fedf2b17cc8 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -126,13 +126,12 @@
 #define DRBD_RESYNC_RATE_DEF 250
 #define DRBD_RESYNC_RATE_SCALE 'k'  /* kilobytes */
 
-  /* less than 7 would hit performance unnecessarily.
-   * 919 slots context information per transaction,
-   * 32k activity log, 4k transaction size,
-   * one transaction in flight:
-   * 919 * 7 = 6433 */
+  /* less than 7 would hit performance unnecessarily. */
 #define DRBD_AL_EXTENTS_MIN  7
-#define DRBD_AL_EXTENTS_MAX  6433
+  /* we use u16 as "slot number", (u16)~0 is "FREE".
+   * If you use >= 292 kB on-disk ring buffer,
+   * this is the maximum you can use: */
+#define DRBD_AL_EXTENTS_MAX  0xfffe
 #define DRBD_AL_EXTENTS_DEF  1237
 #define DRBD_AL_EXTENTS_SCALE '1'
 
-- cgit
From 66311274691ec65972cad3626057fa8d00c146d8 Mon Sep 17 00:00:00 2001
From: Lin Ming
Date: Sat, 23 Mar 2013 11:42:24 +0800
Subject: block: add a flag to identify PM request

Add a flag REQ_PM to identify a request as PM related; such requests
will not change the device request queue's runtime status.

It is intended to be used in a driver's runtime PM callback, so that
the driver can perform some IO to the device there with the queue's
runtime status unaffected.

e.g. in SCSI disk's runtime suspend callback, the disk will be put into
stopped power state, and this requires sending a command to the device.
Such command processing should not change the disk's runtime status. Signed-off-by: Lin Ming Signed-off-by: Aaron Lu Acked-by: Alan Stern Signed-off-by: Jens Axboe --- include/linux/blk_types.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e645..fcc1ce28d5ca 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -175,6 +175,7 @@ enum rq_flag_bits { __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ __REQ_KERNEL, /* direct IO to kernel pages */ + __REQ_PM, /* runtime pm request */ __REQ_NR_BITS, /* stops here */ }; @@ -223,5 +224,6 @@ enum rq_flag_bits { #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) #define REQ_SECURE (1 << __REQ_SECURE) #define REQ_KERNEL (1 << __REQ_KERNEL) +#define REQ_PM (1 << __REQ_PM) #endif /* __LINUX_BLK_TYPES_H */ -- cgit From 6c9546675864f51506af69eca388e5d922942c56 Mon Sep 17 00:00:00 2001 From: Lin Ming Date: Sat, 23 Mar 2013 11:42:26 +0800 Subject: block: add runtime pm helpers Add runtime pm helper functions: void blk_pm_runtime_init(struct request_queue *q, struct device *dev) - Initialization function for drivers to call. int blk_pre_runtime_suspend(struct request_queue *q) - If any requests are in the queue, mark last busy and return -EBUSY. Otherwise set q->rpm_status to RPM_SUSPENDING and return 0. void blk_post_runtime_suspend(struct request_queue *q, int err) - If the suspend succeeded then set q->rpm_status to RPM_SUSPENDED. Otherwise set it to RPM_ACTIVE and mark last busy. void blk_pre_runtime_resume(struct request_queue *q) - Set q->rpm_status to RPM_RESUMING. void blk_post_runtime_resume(struct request_queue *q, int err) - If the resume succeeded then set q->rpm_status to RPM_ACTIVE and call __blk_run_queue, then mark last busy and autosuspend. Otherwise set q->rpm_status to RPM_SUSPENDED. The idea and API is designed by Alan Stern and described here: http://marc.info/?l=linux-scsi&m=133727953625963&w=2 Signed-off-by: Lin Ming Signed-off-by: Aaron Lu Acked-by: Alan Stern Signed-off-by: Jens Axboe --- block/blk-core.c | 144 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 27 ++++++++++ 2 files changed, 171 insertions(+) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 074b758efc42..123d240132bf 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -30,6 +30,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -3045,6 +3046,149 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); +#ifdef CONFIG_PM_RUNTIME +/** + * blk_pm_runtime_init - Block layer runtime PM initialization routine + * @q: the queue of the device + * @dev: the device the queue belongs to + * + * Description: + * Initialize runtime-PM-related fields for @q and start auto suspend for + * @dev. Drivers that want to take advantage of request-based runtime PM + * should call this function after @dev has been initialized, and its + * request queue @q has been allocated, and runtime PM for it can not happen + * yet(either due to disabled/forbidden or its usage_count > 0). In most + * cases, driver should call this function before any I/O has taken place. + * + * This function takes care of setting up using auto suspend for the device, + * the autosuspend delay is set to -1 to make runtime suspend impossible + * until an updated value is either set by user or by driver. 
Drivers do + * not need to touch other autosuspend settings. + * + * The block layer runtime PM is request based, so only works for drivers + * that use request as their IO unit instead of those directly use bio's. + */ +void blk_pm_runtime_init(struct request_queue *q, struct device *dev) +{ + q->dev = dev; + q->rpm_status = RPM_ACTIVE; + pm_runtime_set_autosuspend_delay(q->dev, -1); + pm_runtime_use_autosuspend(q->dev); +} +EXPORT_SYMBOL(blk_pm_runtime_init); + +/** + * blk_pre_runtime_suspend - Pre runtime suspend check + * @q: the queue of the device + * + * Description: + * This function will check if runtime suspend is allowed for the device + * by examining if there are any requests pending in the queue. If there + * are requests pending, the device can not be runtime suspended; otherwise, + * the queue's status will be updated to SUSPENDING and the driver can + * proceed to suspend the device. + * + * For the not allowed case, we mark last busy for the device so that + * runtime PM core will try to autosuspend it some time later. + * + * This function should be called near the start of the device's + * runtime_suspend callback. + * + * Return: + * 0 - OK to runtime suspend the device + * -EBUSY - Device should not be runtime suspended + */ +int blk_pre_runtime_suspend(struct request_queue *q) +{ + int ret = 0; + + spin_lock_irq(q->queue_lock); + if (q->nr_pending) { + ret = -EBUSY; + pm_runtime_mark_last_busy(q->dev); + } else { + q->rpm_status = RPM_SUSPENDING; + } + spin_unlock_irq(q->queue_lock); + return ret; +} +EXPORT_SYMBOL(blk_pre_runtime_suspend); + +/** + * blk_post_runtime_suspend - Post runtime suspend processing + * @q: the queue of the device + * @err: return value of the device's runtime_suspend function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime suspend function and mark last busy for the device so + * that PM core will try to auto suspend the device at a later time. + * + * This function should be called near the end of the device's + * runtime_suspend callback. + */ +void blk_post_runtime_suspend(struct request_queue *q, int err) +{ + spin_lock_irq(q->queue_lock); + if (!err) { + q->rpm_status = RPM_SUSPENDED; + } else { + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + } + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_post_runtime_suspend); + +/** + * blk_pre_runtime_resume - Pre runtime resume processing + * @q: the queue of the device + * + * Description: + * Update the queue's runtime status to RESUMING in preparation for the + * runtime resume of the device. + * + * This function should be called near the start of the device's + * runtime_resume callback. + */ +void blk_pre_runtime_resume(struct request_queue *q) +{ + spin_lock_irq(q->queue_lock); + q->rpm_status = RPM_RESUMING; + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_pre_runtime_resume); + +/** + * blk_post_runtime_resume - Post runtime resume processing + * @q: the queue of the device + * @err: return value of the device's runtime_resume function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime_resume function. If it is successfully resumed, process + * the requests that are queued into the device's queue when it is resuming + * and then mark last busy and initiate autosuspend for it. + * + * This function should be called near the end of the device's + * runtime_resume callback. 
+ */ +void blk_post_runtime_resume(struct request_queue *q, int err) +{ + spin_lock_irq(q->queue_lock); + if (!err) { + q->rpm_status = RPM_ACTIVE; + __blk_run_queue(q); + pm_runtime_mark_last_busy(q->dev); + pm_runtime_autosuspend(q->dev); + } else { + q->rpm_status = RPM_SUSPENDED; + } + spin_unlock_irq(q->queue_lock); +} +EXPORT_SYMBOL(blk_post_runtime_resume); +#endif + int __init blk_dev_init(void) { BUILD_BUG_ON(__REQ_NR_BITS > 8 * diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 78feda9bbae2..89d89c7162aa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -361,6 +361,12 @@ struct request_queue { */ struct kobject kobj; +#ifdef CONFIG_PM_RUNTIME + struct device *dev; + int rpm_status; + unsigned int nr_pending; +#endif + /* * queue settings */ @@ -960,6 +966,27 @@ struct request_queue *blk_alloc_queue(gfp_t); struct request_queue *blk_alloc_queue_node(gfp_t, int); extern void blk_put_queue(struct request_queue *); +/* + * block layer runtime pm functions + */ +#ifdef CONFIG_PM_RUNTIME +extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev); +extern int blk_pre_runtime_suspend(struct request_queue *q); +extern void blk_post_runtime_suspend(struct request_queue *q, int err); +extern void blk_pre_runtime_resume(struct request_queue *q); +extern void blk_post_runtime_resume(struct request_queue *q, int err); +#else +static inline void blk_pm_runtime_init(struct request_queue *q, + struct device *dev) {} +static inline int blk_pre_runtime_suspend(struct request_queue *q) +{ + return -ENOSYS; +} +static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {} +static inline void blk_pre_runtime_resume(struct request_queue *q) {} +static inline void blk_post_runtime_resume(struct request_queue *q, int err) {} +#endif + /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests -- cgit From 57fb233f078beb5d0437a4ae575fbd4d9eb9c738 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Aug 2012 04:56:11 -0700 Subject: block: Reorder struct bio_set This is prep work for the next patch, which embeds a struct bio_list in struct bio_set. Signed-off-by: Kent Overstreet CC: Jens Axboe --- include/linux/bio.h | 66 ++++++++++++++++++++++++++--------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 820e7aaad4fd..93d3d17a300d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,39 +298,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; } static inline void bio_disassociate_task(struct bio *bio) { } #endif /* CONFIG_BLK_CGROUP */ -/* - * bio_set is used to allow other portions of the IO system to - * allocate their own private memory pools for bio and iovec structures. - * These memory pools in turn all allocate from the bio_slab - * and the bvec_slabs[]. - */ -#define BIO_POOL_SIZE 2 -#define BIOVEC_NR_POOLS 6 -#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) - -struct bio_set { - struct kmem_cache *bio_slab; - unsigned int front_pad; - - mempool_t *bio_pool; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - mempool_t *bio_integrity_pool; -#endif - mempool_t *bvec_pool; -}; - -struct biovec_slab { - int nr_vecs; - char *name; - struct kmem_cache *slab; -}; - -/* - * a small number of entries is fine, not going to be performance critical. 
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
-
 #ifdef CONFIG_HIGHMEM
 /*
  * remember never ever reenable interrupts between a bvec_kmap_irq and
@@ -527,6 +494,39 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
 	return bio;
 }
 
+/*
+ * bio_set is used to allow other portions of the IO system to
+ * allocate their own private memory pools for bio and iovec structures.
+ * These memory pools in turn all allocate from the bio_slab
+ * and the bvec_slabs[].
+ */
+#define BIO_POOL_SIZE 2
+#define BIOVEC_NR_POOLS 6
+#define BIOVEC_MAX_IDX	(BIOVEC_NR_POOLS - 1)
+
+struct bio_set {
+	struct kmem_cache *bio_slab;
+	unsigned int front_pad;
+
+	mempool_t *bio_pool;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+	mempool_t *bio_integrity_pool;
+#endif
+	mempool_t *bvec_pool;
+};
+
+struct biovec_slab {
+	int nr_vecs;
+	char *name;
+	struct kmem_cache *slab;
+};
+
+/*
+ * a small number of entries is fine, not going to be performance critical.
+ * basically we just need to survive
+ */
+#define BIO_SPLIT_ENTRIES 2
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 #define bip_vec_idx(bip, idx)	(&(bip->bip_vec[(idx)]))
-- cgit
From df2cb6daa4cbc34406bc4b1ac9b9335df1083a72 Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 10 Sep 2012 14:33:46 -0700
Subject: block: Avoid deadlocks with bio allocation by stacking drivers

Previously, if we ever tried to allocate more than once from the same
bio set while running under generic_make_request() (i.e. in a stacking
block driver), we risked deadlock.

This is because of the code in generic_make_request() that converts
recursion to iteration; any bios we submit won't actually be submitted
(so they can complete and eventually be freed) until after we return -
this means if we allocate a second bio, we're blocking the first one
from ever being freed.

Thus if enough threads call into a stacking block driver at the same
time with bios that need multiple splits, and the bio_set's reserve
gets used up, we deadlock.

This can be worked around in the driver code - we could check if we're
running under generic_make_request(), then mask out __GFP_WAIT when we
go to allocate a bio, and if the allocation fails punt to a workqueue
and retry the allocation. But this is tricky and not a generic
solution.

This patch solves it for all users by inverting the previously
described technique. We allocate a rescuer workqueue for each bio_set,
and then in the allocation code, if there are bios on
current->bio_list that we would be blocking, we punt them to the
rescuer workqueue to be submitted.

This guarantees forward progress for bio allocations under
generic_make_request() provided each bio is submitted before allocating
the next, and provided the bios are freed after they complete.

Note that this doesn't do anything for allocation from other mempools.
Instead of allocating per bio data structures from a mempool, code
should use bio_set's front_pad.

Tested it by forcing the rescue codepath to be taken (by disabling the
first GFP_NOWAIT attempt), and then ran it with bcache (which does a
lot of arbitrary bio splitting) and verified that the rescuer was being
invoked.
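To illustrate the guarantee (sketch only; my_bio_set, bio_needs_split()
and split_front() are invented names, not part of this patch), a
stacking driver can now safely do:

	static void my_make_request(struct request_queue *q, struct bio *bio)
	{
		while (bio_needs_split(bio)) {
			/* May block, but the bio_set's rescuer guarantees
			 * forward progress because each split is submitted
			 * before the next allocation. */
			struct bio *split =
				bio_alloc_bioset(GFP_NOIO, 0, my_bio_set);

			split_front(bio, split);
			generic_make_request(split);
		}
		generic_make_request(bio);
	}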
Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo Reviewed-by: Muthukumar Ratty --- fs/bio.c | 116 +++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/bio.h | 9 ++++ 2 files changed, 123 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index bb5768f59b32..73b544709945 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -297,6 +297,54 @@ void bio_reset(struct bio *bio) } EXPORT_SYMBOL(bio_reset); +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + generic_make_request(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(current->bio_list))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + + *current->bio_list = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -314,11 +362,27 @@ EXPORT_SYMBOL(bio_reset); * previously allocated bio for IO before attempting to allocate a new one. * Failure to do so can cause deadlocks under memory pressure. * + * Note that when running under generic_make_request() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * generic_make_request() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * generic_make_request() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * generic_make_request() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * * RETURNS: * Pointer to new bio on success, NULL on failure. */ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { + gfp_t saved_gfp = gfp_mask; unsigned front_pad; unsigned inline_vecs; unsigned long idx = BIO_POOL_NONE; @@ -336,7 +400,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) front_pad = 0; inline_vecs = nr_iovecs; } else { + /* + * generic_make_request() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath generic_make_request(). 
If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_WAIT; if that fails, we punt those bios we + * would be blocking to the rescuer workqueue before we retry + * with the original gfp_flags. + */ + + if (current->bio_list && !bio_list_empty(current->bio_list)) + gfp_mask &= ~__GFP_WAIT; + p = mempool_alloc(bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(bs->bio_pool, gfp_mask); + } + front_pad = bs->front_pad; inline_vecs = BIO_INLINE_VECS; } @@ -349,6 +443,12 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (nr_iovecs > inline_vecs) { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + } + if (unlikely(!bvl)) goto err_free; } else if (nr_iovecs) { @@ -1579,6 +1679,9 @@ static void biovec_free_pools(struct bio_set *bs) void bioset_free(struct bio_set *bs) { + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + if (bs->bio_pool) mempool_destroy(bs->bio_pool); @@ -1614,6 +1717,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) bs->front_pad = front_pad; + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); if (!bs->bio_slab) { kfree(bs); @@ -1624,9 +1731,14 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (!biovec_create_pools(bs, pool_size)) - return bs; + if (biovec_create_pools(bs, pool_size)) + goto bad; + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + return bs; bad: bioset_free(bs); return NULL; diff --git a/include/linux/bio.h b/include/linux/bio.h index 93d3d17a300d..b31036ff779f 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -513,6 +513,15 @@ struct bio_set { mempool_t *bio_integrity_pool; #endif mempool_t *bvec_pool; + + /* + * Deadlock avoidance for stacking block drivers: see comments in + * bio_alloc_bioset() for details + */ + spinlock_t rescue_lock; + struct bio_list rescue_list; + struct work_struct rescue_work; + struct workqueue_struct *rescue_workqueue; }; struct biovec_slab { -- cgit From 6fda981cafbf908acd11e1e636fec50e99d56a47 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2012 13:18:27 -0700 Subject: block: Fix a buffer overrun in bio_integrity_split() bio_integrity_split() seemed to be confusing pointers and arrays - bip_vec in bio_integrity_payload was an array appended to the end of the payload, so the bio_vecs in struct bio_pair should have come after the bio_integrity_payload they're for. Fix it by making bip_vec a pointer to the inline vecs - a later patch is going to make more use of this pointer. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. 
Petersen --- fs/bio-integrity.c | 5 +++-- include/linux/bio.h | 4 +++- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index a3f28f331b2b..94fa1c562c0e 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -112,6 +112,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, bip->bip_slab = idx; bip->bip_bio = bio; + bip->bip_vec = bip->bip_inline_vecs; bio->bi_integrity = bip; return bip; @@ -697,8 +698,8 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors) bp->iv1 = bip->bip_vec[0]; bp->iv2 = bip->bip_vec[0]; - bp->bip1.bip_vec[0] = bp->iv1; - bp->bip2.bip_vec[0] = bp->iv2; + bp->bip1.bip_vec = &bp->iv1; + bp->bip2.bip_vec = &bp->iv2; bp->iv1.bv_len = sectors * bi->tuple_size; bp->iv2.bv_offset += sectors * bi->tuple_size; diff --git a/include/linux/bio.h b/include/linux/bio.h index b31036ff779f..81004fdcc277 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -182,7 +182,9 @@ struct bio_integrity_payload { unsigned short bip_idx; /* current bip_vec index */ struct work_struct bip_work; /* I/O completion */ - struct bio_vec bip_vec[0]; /* embedded bvec array */ + + struct bio_vec *bip_vec; + struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ -- cgit From 9f060e2231ca96ca94f2ffcff730acd72606b280 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 12 Oct 2012 15:29:33 -0700 Subject: block: Convert integrity to bvec_alloc_bs() This adds a pointer to the bvec array to struct bio_integrity_payload, instead of the bvecs always being inline; then the bvecs are allocated with bvec_alloc_bs(). Changed bvec_alloc_bs() and bvec_free_bs() to take a pointer to a mempool instead of the bioset, so that bio integrity can use a different mempool for its bvecs, and thus avoid a potential deadlock. This is eventually for immutable bio vecs - immutable bvecs aren't useful if we still have to copy them, hence the need for the pointer. Less code is always nice too, though. Also, bio_integrity_alloc() was using fs_bio_set if no bio_set was specified. This was wrong - using the bio_set doesn't protect us from memory allocation failures, because we just used kmalloc for the bio_integrity_payload. But it does introduce the possibility of deadlock, if for some reason we weren't supposed to be using fs_bio_set. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. Petersen --- fs/bio-integrity.c | 132 +++++++++++++++++++--------------------------------- fs/bio.c | 36 ++++++-------- include/linux/bio.h | 8 ++-- 3 files changed, 68 insertions(+), 108 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 94fa1c562c0e..8c4c604c840d 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -27,48 +27,11 @@ #include #include -struct integrity_slab { - struct kmem_cache *slab; - unsigned short nr_vecs; - char name[8]; -}; - -#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) } -struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = { - IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES), -}; -#undef IS +#define BIP_INLINE_VECS 4 +static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; -static inline unsigned int vecs_to_idx(unsigned int nr) -{ - switch (nr) { - case 1: - return 0; - case 2 ... 4: - return 1; - case 5 ... 16: - return 2; - case 17 ... 64: - return 3; - case 65 ... 128: - return 4; - case 129 ... 
BIO_MAX_PAGES: - return 5; - default: - BUG(); - } -} - -static inline int use_bip_pool(unsigned int idx) -{ - if (idx == BIOVEC_MAX_IDX) - return 1; - - return 0; -} - /** * bio_integrity_alloc - Allocate integrity payload and attach it to bio * @bio: bio to attach integrity metadata to @@ -84,38 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, unsigned int nr_vecs) { struct bio_integrity_payload *bip; - unsigned int idx = vecs_to_idx(nr_vecs); struct bio_set *bs = bio->bi_pool; - - if (!bs) - bs = fs_bio_set; - - BUG_ON(bio == NULL); - bip = NULL; - - /* Lower order allocations come straight from slab */ - if (!use_bip_pool(idx)) - bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask); - - /* Use mempool if lower order alloc failed or max vecs were requested */ - if (bip == NULL) { - idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ + unsigned long idx = BIO_POOL_NONE; + unsigned inline_vecs; + + if (!bs) { + bip = kmalloc(sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * nr_vecs, gfp_mask); + inline_vecs = nr_vecs; + } else { bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); - - if (unlikely(bip == NULL)) { - printk(KERN_ERR "%s: could not alloc bip\n", __func__); - return NULL; - } + inline_vecs = BIP_INLINE_VECS; } + if (unlikely(!bip)) + return NULL; + memset(bip, 0, sizeof(*bip)); + if (nr_vecs > inline_vecs) { + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + } else { + bip->bip_vec = bip->bip_inline_vecs; + } + bip->bip_slab = idx; bip->bip_bio = bio; - bip->bip_vec = bip->bip_inline_vecs; bio->bi_integrity = bip; return bip; +err: + mempool_free(bip, bs->bio_integrity_pool); + return NULL; } EXPORT_SYMBOL(bio_integrity_alloc); @@ -131,20 +97,20 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - if (!bs) - bs = fs_bio_set; - - BUG_ON(bip == NULL); - /* A cloned bio doesn't own the integrity metadata */ if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) && bip->bip_buf != NULL) kfree(bip->bip_buf); - if (use_bip_pool(bip->bip_slab)) + if (bs) { + if (bip->bip_slab != BIO_POOL_NONE) + bvec_free(bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, bs->bio_integrity_pool); - else - kmem_cache_free(bip_slab[bip->bip_slab].slab, bip); + } else { + kfree(bip); + } bio->bi_integrity = NULL; } @@ -747,13 +713,14 @@ EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { - unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES); - if (bs->bio_integrity_pool) return 0; - bs->bio_integrity_pool = - mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab); + bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab); + + bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_integrity_pool) + return -1; if (!bs->bio_integrity_pool) return -1; @@ -766,13 +733,14 @@ void bioset_integrity_free(struct bio_set *bs) { if (bs->bio_integrity_pool) mempool_destroy(bs->bio_integrity_pool); + + if (bs->bvec_integrity_pool) + mempool_destroy(bs->bio_integrity_pool); } EXPORT_SYMBOL(bioset_integrity_free); void __init bio_integrity_init(void) { - unsigned int i; - /* * kintegrityd won't block much but may burn a lot of CPU cycles. * Make it highpri CPU intensive wq with max concurrency of 1. 
@@ -782,14 +750,10 @@ void __init bio_integrity_init(void) if (!kintegrityd_wq) panic("Failed to create kintegrityd\n"); - for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) { - unsigned int size; - - size = sizeof(struct bio_integrity_payload) - + bip_slab[i].nr_vecs * sizeof(struct bio_vec); - - bip_slab[i].slab = - kmem_cache_create(bip_slab[i].name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); - } + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + if (!bip_slab) + panic("Failed to create slab\n"); } diff --git a/fs/bio.c b/fs/bio.c index 73b544709945..40aa96eae99f 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -160,12 +160,12 @@ unsigned int bvec_nr_vecs(unsigned short idx) return bvec_slabs[idx].nr_vecs; } -void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) { BIO_BUG_ON(idx >= BIOVEC_NR_POOLS); if (idx == BIOVEC_MAX_IDX) - mempool_free(bv, bs->bvec_pool); + mempool_free(bv, pool); else { struct biovec_slab *bvs = bvec_slabs + idx; @@ -173,8 +173,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx) } } -struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, - struct bio_set *bs) +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) { struct bio_vec *bvl; @@ -210,7 +210,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, */ if (*idx == BIOVEC_MAX_IDX) { fallback: - bvl = mempool_alloc(bs->bvec_pool, gfp_mask); + bvl = mempool_alloc(pool, gfp_mask); } else { struct biovec_slab *bvs = bvec_slabs + *idx; gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO); @@ -253,7 +253,7 @@ static void bio_free(struct bio *bio) if (bs) { if (bio_has_allocated_vec(bio)) - bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio)); + bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* * If we have front padding, adjust the bio pointer before freeing @@ -442,11 +442,11 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bio_init(bio); if (nr_iovecs > inline_vecs) { - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool); } if (unlikely(!bvl)) @@ -1661,20 +1661,11 @@ EXPORT_SYMBOL(bio_sector_offset); * create memory pools for biovec's in a bio_set. * use the global biovec slabs created for general use. 
*/ -static int biovec_create_pools(struct bio_set *bs, int pool_entries) +mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries) { struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX; - bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab); - if (!bs->bvec_pool) - return -ENOMEM; - - return 0; -} - -static void biovec_free_pools(struct bio_set *bs) -{ - mempool_destroy(bs->bvec_pool); + return mempool_create_slab_pool(pool_entries, bp->slab); } void bioset_free(struct bio_set *bs) @@ -1685,8 +1676,10 @@ void bioset_free(struct bio_set *bs) if (bs->bio_pool) mempool_destroy(bs->bio_pool); + if (bs->bvec_pool) + mempool_destroy(bs->bvec_pool); + bioset_integrity_free(bs); - biovec_free_pools(bs); bio_put_slab(bs); kfree(bs); @@ -1731,7 +1724,8 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad) if (!bs->bio_pool) goto bad; - if (biovec_create_pools(bs, pool_size)) + bs->bvec_pool = biovec_create_pool(bs, pool_size); + if (!bs->bvec_pool) goto bad; bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); diff --git a/include/linux/bio.h b/include/linux/bio.h index 81004fdcc277..669b1cb18fee 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -213,6 +213,7 @@ extern void bio_pair_release(struct bio_pair *dbio); extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); +extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries); extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); extern void bio_put(struct bio *); @@ -288,8 +289,8 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, int, int, gfp_t); extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); -extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); -extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); +extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *); +extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); #ifdef CONFIG_BLK_CGROUP @@ -511,10 +512,11 @@ struct bio_set { unsigned int front_pad; mempool_t *bio_pool; + mempool_t *bvec_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; + mempool_t *bvec_integrity_pool; #endif - mempool_t *bvec_pool; /* * Deadlock avoidance for stacking block drivers: see comments in -- cgit From 054bdf646e36c2f7dc1bf6bc6209dbbb5909164b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Sep 2012 13:17:55 -0700 Subject: block: Add bio_advance() This is prep work for immutable bio vecs; we first want to centralize where bvecs are modified. Next two patches convert some existing code to use this function. 
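As a sketch of the intended use (complete_partial() is an invented
caller, not from this patch), a driver finishing a bio piecewise would
do:

	static void complete_partial(struct bio *bio, unsigned int done)
	{
		/* Consume the first @done bytes; bi_sector, bi_size and
		 * bi_idx stay consistent. */
		bio_advance(bio, done);

		if (bio->bi_size)
			generic_make_request(bio);	/* resubmit the rest */
		else
			bio_endio(bio, 0);
	}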
Signed-off-by: Kent Overstreet CC: Jens Axboe --- fs/bio.c | 41 +++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 2 ++ include/linux/blk_types.h | 2 ++ 3 files changed, 45 insertions(+) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index 40aa96eae99f..7edc08d2246c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -752,6 +752,47 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio->bi_sector += bytes >> 9; + bio->bi_size -= bytes; + + if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK) + return; + + while (bytes) { + if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { + WARN_ONCE(1, "bio idx %d >= vcnt %d\n", + bio->bi_idx, bio->bi_vcnt); + break; + } + + if (bytes >= bio_iovec(bio)->bv_len) { + bytes -= bio_iovec(bio)->bv_len; + bio->bi_idx++; + } else { + bio_iovec(bio)->bv_len -= bytes; + bio_iovec(bio)->bv_offset += bytes; + bytes = 0; + } + } +} +EXPORT_SYMBOL(bio_advance); + struct bio_map_data { struct bio_vec *iovecs; struct sg_iovec *sgvecs; diff --git a/include/linux/bio.h b/include/linux/bio.h index 669b1cb18fee..fcb4dba2d8ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -248,6 +248,8 @@ extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); +extern void bio_advance(struct bio *, unsigned); + extern void bio_init(struct bio *); extern void bio_reset(struct bio *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index cdf11191e645..c178d25e588b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -197,6 +197,8 @@ enum rq_flag_bits { REQ_SECURE) #define REQ_CLONE_MASK REQ_COMMON_MASK +#define BIO_NO_ADVANCE_ITER_MASK (REQ_DISCARD|REQ_WRITE_SAME) + /* This mask is used for both bio and request merge checking */ #define REQ_NOMERGE_FLAGS \ (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA) -- cgit From f73a1c7d117d07a96d89475066188a2b79e53c48 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 25 Sep 2012 15:05:12 -0700 Subject: block: Add bio_end_sector() Just a little convenience macro - main reason to add it now is preparing for immutable bio vecs, it'll reduce the size of the patch that puts bi_sector/bi_size/bi_idx into a struct bvec_iter. 
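The conversion pattern, roughly (the capacity check mirrors the call
sites below; "disk" stands for whatever gendisk is being checked):

	/* before: open-coded arithmetic, easy to get subtly wrong */
	if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(disk))
		goto fail;

	/* after: same check via the new macro */
	if (bio_end_sector(bio) > get_capacity(disk))
		goto fail;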
Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Lars Ellenberg CC: Jiri Kosina CC: Alasdair Kergon CC: dm-devel@redhat.com CC: Neil Brown CC: Martin Schwidefsky CC: Heiko Carstens CC: linux-s390@vger.kernel.org CC: Chris Mason CC: Steven Whitehouse Acked-by: Steven Whitehouse --- block/blk-core.c | 2 +- block/cfq-iosched.c | 7 ++----- block/deadline-iosched.c | 2 +- drivers/block/brd.c | 3 +-- drivers/block/pktcdvd.c | 6 +++--- drivers/md/dm-stripe.c | 2 +- drivers/md/dm-verity.c | 2 +- drivers/md/faulty.c | 6 ++---- drivers/md/linear.c | 3 +-- drivers/md/raid1.c | 4 ++-- drivers/md/raid5.c | 14 +++++++------- drivers/s390/block/dcssblk.c | 3 +-- fs/btrfs/extent_io.c | 3 +-- fs/gfs2/lops.c | 2 +- include/linux/bio.h | 1 + 15 files changed, 26 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/block/blk-core.c b/block/blk-core.c index 86a1afeef606..7236b826f4a1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1586,7 +1586,7 @@ static void handle_bad_sector(struct bio *bio) printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", bdevname(bio->bi_bdev, b), bio->bi_rw, - (unsigned long long)bio->bi_sector + bio_sectors(bio), + (unsigned long long)bio_end_sector(bio), (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); set_bit(BIO_EOF, &bio->bi_flags); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 4f0ade74cfd0..d5cd3131c57a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2270,11 +2270,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) return NULL; cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); - if (cfqq) { - sector_t sector = bio->bi_sector + bio_sectors(bio); - - return elv_rb_find(&cfqq->sort_list, sector); - } + if (cfqq) + return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); return NULL; } diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index 90037b5eb17f..ba19a3afab79 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -132,7 +132,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) * check for front merge */ if (dd->front_merges) { - sector_t sector = bio->bi_sector + bio_sectors(bio); + sector_t sector = bio_end_sector(bio); __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); if (__rq) { diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 531ceb31d0ff..f1a29f8e9d33 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -334,8 +334,7 @@ static void brd_make_request(struct request_queue *q, struct bio *bio) int err = -EIO; sector = bio->bi_sector; - if (sector + (bio->bi_size >> SECTOR_SHIFT) > - get_capacity(bdev->bd_disk)) + if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) goto out; if (unlikely(bio->bi_rw & REQ_DISCARD)) { diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 2e7de7a59bfc..26938e8e2fc3 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -901,7 +901,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) pd->iosched.successive_reads += bio->bi_size >> 10; else { pd->iosched.successive_reads = 0; - pd->iosched.last_write = bio->bi_sector + bio_sectors(bio); + pd->iosched.last_write = bio_end_sector(bio); } if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) { if (pd->read_speed == pd->write_speed) { @@ -2454,7 +2454,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) zone = ZONE(bio->bi_sector, pd); VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n", (unsigned long long)bio->bi_sector, - (unsigned long 
long)(bio->bi_sector + bio_sectors(bio))); + (unsigned long long)bio_end_sector(bio)); /* Check if we have to split the bio */ { @@ -2462,7 +2462,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio) sector_t last_zone; int first_sectors; - last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd); + last_zone = ZONE(bio_end_sector(bio) - 1, pd); if (last_zone != zone) { BUG_ON(last_zone != zone + pd->settings.size); first_sectors = last_zone - bio->bi_sector; diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index d8837d313f54..ea5e878a30b9 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -258,7 +258,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio, sector_t begin, end; stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin); - stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio), + stripe_map_range_sector(sc, bio_end_sector(bio), target_stripe, &end); if (begin < end) { bio->bi_bdev = sc->stripe[target_stripe].dev->bdev; diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c index 6ad538375c3c..923115d08baa 100644 --- a/drivers/md/dm-verity.c +++ b/drivers/md/dm-verity.c @@ -472,7 +472,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio) return -EIO; } - if ((bio->bi_sector + bio_sectors(bio)) >> + if (bio_end_sector(bio) >> (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { DMERR_LIMIT("io out of range"); return -EIO; diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 5e7dc772f5de..3193aefe982b 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) return; } - if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9), - WRITE)) + if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE)) failit = 1; if (check_mode(conf, WritePersistent)) { add_sector(conf, bio->bi_sector, WritePersistent); @@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio) failit = 1; } else { /* read request */ - if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9), - READ)) + if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ)) failit = 1; if (check_mode(conf, ReadTransient)) failit = 1; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 21014836bdbf..f03fabd2b37b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) bio_io_error(bio); return; } - if (unlikely(bio->bi_sector + (bio->bi_size >> 9) > - tmp_dev->end_sector)) { + if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) { /* This bio crosses a device boundary, so we have to * split it. 
*/ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fd86b372692d..4d8c2e0a6bad 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1018,7 +1018,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) md_write_start(mddev, bio); /* wait on superblock update early */ if (bio_data_dir(bio) == WRITE && - bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo && + bio_end_sector(bio) > mddev->suspend_lo && bio->bi_sector < mddev->suspend_hi) { /* As the suspend_* range is controlled by * userspace, we want an interruptible @@ -1029,7 +1029,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) flush_signals(current); prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); - if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo || + if (bio_end_sector(bio) <= mddev->suspend_lo || bio->bi_sector >= mddev->suspend_hi) break; schedule(); diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3ee2912889e7..68706970d217 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2384,11 +2384,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in } else bip = &sh->dev[dd_idx].toread; while (*bip && (*bip)->bi_sector < bi->bi_sector) { - if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) + if (bio_end_sector(*bip) > bi->bi_sector) goto overlap; bip = & (*bip)->bi_next; } - if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) + if (*bip && (*bip)->bi_sector < bio_end_sector(bi)) goto overlap; BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); @@ -2404,8 +2404,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && bi && bi->bi_sector <= sector; bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { - if (bi->bi_sector + (bi->bi_size>>9) >= sector) - sector = bi->bi_sector + (bi->bi_size>>9); + if (bio_end_sector(bi) >= sector) + sector = bio_end_sector(bi); } if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); @@ -3941,7 +3941,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) 0, &dd_idx, NULL); - end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9); + end_sector = bio_end_sector(align_bi); rcu_read_lock(); rdev = rcu_dereference(conf->disks[dd_idx].replacement); if (!rdev || test_bit(Faulty, &rdev->flags) || @@ -4216,7 +4216,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) } logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); - last_sector = bi->bi_sector + (bi->bi_size>>9); + last_sector = bio_end_sector(bi); bi->bi_next = NULL; bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ @@ -4679,7 +4679,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); - last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); + last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; logical_sector += STRIPE_SECTORS, diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index b6ad0de07930..12d08b4529e9 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -826,8 +826,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio) if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0) /* Request is not page-aligned. 
*/ goto fail; - if (((bio->bi_size >> 9) + bio->bi_sector) - > get_capacity(bio->bi_bdev->bd_disk)) { + if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) { /* Request beyond end of DCSS segment. */ goto fail; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f173c5af6461..bed072aa461f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2527,8 +2527,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, if (old_compressed) contig = bio->bi_sector == sector; else - contig = bio->bi_sector + (bio->bi_size >> 9) == - sector; + contig = bio_end_sector(bio) == sector; if (prev_bio_flags != bio_flags || !contig || merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a5055977a214..5c37ef982390 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno) u64 nblk; if (bio) { - nblk = bio->bi_sector + bio_sectors(bio); + nblk = bio_end_sector(bio); nblk >>= sdp->sd_fsb2bb_shift; if (blkno == nblk) return bio; diff --git a/include/linux/bio.h b/include/linux/bio.h index fcb4dba2d8ea..20507eb7c979 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -67,6 +67,7 @@ #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) +#define bio_end_sector(bio) ((bio)->bi_sector + bio_sectors((bio))) static inline unsigned int bio_cur_bytes(struct bio *bio) { -- cgit From 9e882242c6193ae6f416f2d8d8db0d9126bd996b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:41:12 -0700 Subject: block: Add submit_bio_wait(), remove from md Random cleanup - this code was duplicated and it's not really specific to md. Also added the ability to return the actual error code. 
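Usage sketch (read_page_sync() and its arguments are illustrative; only
submit_bio_wait() itself is added by this patch) - a synchronous
single-page read from a context that may sleep:

	static int read_page_sync(struct block_device *bdev, sector_t sector,
				  struct page *page)
	{
		struct bio *bio = bio_alloc(GFP_NOIO, 1);
		int err;

		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		bio_add_page(bio, page, PAGE_SIZE, 0);

		/* REQ_SYNC is OR'd in by the helper; returns 0 on success
		 * or the error passed to bio_endio(). */
		err = submit_bio_wait(READ, bio);
		bio_put(bio);
		return err;
	}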
Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown Acked-by: Tejun Heo --- drivers/md/raid1.c | 19 ------------------- drivers/md/raid10.c | 19 ------------------- fs/bio.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 1 + 4 files changed, 37 insertions(+), 38 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index f741c9fe25c8..800748d585ca 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2059,25 +2059,6 @@ static void fix_read_error(struct r1conf *conf, int read_disk, } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - static int narrow_write_error(struct r1bio *r1_bio, int i) { struct mddev *mddev = r1_bio->mddev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6ffb6c08aec5..434586d43115 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2529,25 +2529,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 } } -static void bi_complete(struct bio *bio, int error) -{ - complete((struct completion *)bio->bi_private); -} - -static int submit_bio_wait(int rw, struct bio *bio) -{ - struct completion event; - rw |= REQ_SYNC; - - init_completion(&event); - bio->bi_private = &event; - bio->bi_end_io = bi_complete; - submit_bio(rw, bio); - wait_for_completion(&event); - - return test_bit(BIO_UPTODATE, &bio->bi_flags); -} - static int narrow_write_error(struct r10bio *r10_bio, int i) { struct bio *bio = r10_bio->master_bio; diff --git a/fs/bio.c b/fs/bio.c index f1b4c1651089..4ce24ee5dcd0 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -752,6 +752,42 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, } EXPORT_SYMBOL(bio_add_page); +struct submit_bio_ret { + struct completion event; + int error; +}; + +static void submit_bio_wait_endio(struct bio *bio, int error) +{ + struct submit_bio_ret *ret = bio->bi_private; + + ret->error = error; + complete(&ret->event); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. 
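+ *
+ * This blocks in wait_for_completion() until the bio completes, so it must
+ * be called from process context.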
+ */ +int submit_bio_wait(int rw, struct bio *bio) +{ + struct submit_bio_ret ret; + + rw |= REQ_SYNC; + init_completion(&ret.event); + bio->bi_private = &ret; + bio->bi_end_io = submit_bio_wait_endio; + submit_bio(rw, bio); + wait_for_completion(&ret.event); + + return ret.error; +} +EXPORT_SYMBOL(submit_bio_wait); + /** * bio_advance - increment/complete a bio by some number of bytes * @bio: bio to advance diff --git a/include/linux/bio.h b/include/linux/bio.h index 20507eb7c979..b20a9cd776dd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -249,6 +249,7 @@ extern void bio_endio(struct bio *, int); struct request_queue; extern int bio_phys_segments(struct request_queue *, struct bio *); +extern int submit_bio_wait(int rw, struct bio *bio); extern void bio_advance(struct bio *, unsigned); extern void bio_init(struct bio *); -- cgit From 16ac3d63e74f3d6e34e42d6e523b6a61de0020f0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 13:57:51 -0700 Subject: block: Add bio_copy_data() This gets open coded quite a bit and it's tricky to get right, so make a generic version and convert some existing users over to it instead. Signed-off-by: Kent Overstreet CC: Jens Axboe --- fs/bio.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/bio.h | 2 ++ 2 files changed, 72 insertions(+) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index 4ce24ee5dcd0..e437f9aae67d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -829,6 +829,76 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); +/** + * bio_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats + * @src and @dst as linked lists of bios. + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). 
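+ *
+ * The copy is done through temporary kmap_atomic() mappings, one bvec at a
+ * time.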
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+ struct bio_vec *src_bv, *dst_bv;
+ unsigned src_offset, dst_offset, bytes;
+ void *src_p, *dst_p;
+
+ src_bv = bio_iovec(src);
+ dst_bv = bio_iovec(dst);
+
+ src_offset = src_bv->bv_offset;
+ dst_offset = dst_bv->bv_offset;
+
+ while (1) {
+ if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+ src_bv++;
+ if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+ src = src->bi_next;
+ if (!src)
+ break;
+
+ src_bv = bio_iovec(src);
+ }
+
+ src_offset = src_bv->bv_offset;
+ }
+
+ if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+ dst_bv++;
+ if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+ dst = dst->bi_next;
+ if (!dst)
+ break;
+
+ dst_bv = bio_iovec(dst);
+ }
+
+ dst_offset = dst_bv->bv_offset;
+ }
+
+ bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+ src_bv->bv_offset + src_bv->bv_len - src_offset);
+
+ src_p = kmap_atomic(src_bv->bv_page);
+ dst_p = kmap_atomic(dst_bv->bv_page);
+
+ memcpy(dst_p + dst_offset,
+ src_p + src_offset,
+ bytes);
+
+ kunmap_atomic(dst_p);
+ kunmap_atomic(src_p);
+
+ src_offset += bytes;
+ dst_offset += bytes;
+ }
+}
+EXPORT_SYMBOL(bio_copy_data);
+ struct bio_map_data { struct bio_vec *iovecs; struct sg_iovec *sgvecs; diff --git a/include/linux/bio.h b/include/linux/bio.h index b20a9cd776dd..90d36c65cb70 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -286,6 +286,8 @@ static inline void bio_flush_dcache_pages(struct bio *bi) } #endif +extern void bio_copy_data(struct bio *dst, struct bio *src); + extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, -- cgit From d74c6d514fe314b8bdab58b487b25992291577ec Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 6 Feb 2013 12:23:11 -0800 Subject: block: Add bio_for_each_segment_all() __bio_for_each_segment() iterates bvecs from the specified index instead of bio->bi_idx. Currently, the only usage is to walk all the bvecs after the bio has been advanced by specifying 0 index. For immutable bvecs, we need to split these apart; bio_for_each_segment() is going to have a different implementation. This will also help document the intent of code that's using it - bio_for_each_segment_all() is only legal to use for code that owns the bio. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Neil Brown CC: Boaz Harrosh --- drivers/block/rbd.c | 2 +- drivers/md/raid1.c | 2 +- fs/bio.c | 12 ++++++------ fs/exofs/ore.c | 2 +- fs/exofs/ore_raid.c | 2 +- include/linux/bio.h | 17 ++++++++++++++--- mm/bounce.c | 2 +- 7 files changed, 25 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 6c81a4c040b9..11e179826b60 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -952,7 +952,7 @@ static struct bio *bio_clone_range(struct bio *bio_src, /* Find first affected segment... 
*/ resid = offset; - __bio_for_each_segment(bv, bio_src, idx, 0) { + bio_for_each_segment(bv, bio_src, idx) { if (resid < bv->bv_len) break; resid -= bv->bv_len; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0a3988a25aab..853482015d3d 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1291,7 +1291,7 @@ read_again: * know the original bi_idx, so we just free * them all */ - __bio_for_each_segment(bvec, mbio, j, 0) + bio_for_each_segment_all(bvec, mbio, j) bvec->bv_page = r1_bio->behind_bvecs[j].bv_page; if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) atomic_inc(&r1_bio->behind_remaining); diff --git a/fs/bio.c b/fs/bio.c index e437f9aae67d..618f9044c414 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -961,7 +961,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int iov_idx = 0; unsigned int iov_off = 0; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *bv_addr = page_address(bvec->bv_page); unsigned int bv_len = iovecs[i].bv_len; @@ -1143,7 +1143,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, return bio; cleanup: if (!map_data) - bio_for_each_segment(bvec, bio, i) + bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); bio_put(bio); @@ -1357,7 +1357,7 @@ static void __bio_unmap_user(struct bio *bio) /* * make sure we dirty pages we wrote to */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { if (bio_data_dir(bio) == READ) set_page_dirty_lock(bvec->bv_page); @@ -1463,7 +1463,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) int i; char *p = bmd->sgvecs[0].iov_base; - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; @@ -1503,7 +1503,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, if (!reading) { void *p = data; - bio_for_each_segment(bvec, bio, i) { + bio_for_each_segment_all(bvec, bio, i) { char *addr = page_address(bvec->bv_page); memcpy(addr, p, bvec->bv_len); @@ -1789,7 +1789,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index, if (index >= bio->bi_idx) index = bio->bi_vcnt - 1; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { if (i == index) { if (offset > bv->bv_offset) sectors += (offset - bv->bv_offset) / sector_sz; diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index f936cb50dc0d..b74422888604 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio) struct bio_vec *bv; unsigned i; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { unsigned this_count = bv->bv_len; if (likely(PAGE_SIZE == this_count)) diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index b963f38ac298..7682b970d0f1 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret) if (!bio) continue; - __bio_for_each_segment(bv, bio, i, 0) { + bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; SetPageUptodate(page); diff --git a/include/linux/bio.h b/include/linux/bio.h index 90d36c65cb70..be2efa09f9bf 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -137,16 +137,27 @@ static inline int bio_has_allocated_vec(struct bio *bio) #define bio_io_error(bio) bio_endio((bio), -EIO) /* - * drivers should not use the __ version unless they _really_ want to - 
* run through the entire bio and not just pending pieces + * drivers should not use the __ version unless they _really_ know what + * they're doing */ #define __bio_for_each_segment(bvl, bio, i, start_idx) \ for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ i < (bio)->bi_vcnt; \ bvl++, i++) +/* + * drivers should _never_ use the all version - the bio may have been split + * before it got to the driver and the driver won't own all of it + */ +#define bio_for_each_segment_all(bvl, bio, i) \ + for (i = 0; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) + #define bio_for_each_segment(bvl, bio, i) \ - __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) + for (i = (bio)->bi_idx; \ + bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt; \ + i++) /* * get a reference to a bio, so it won't disappear. the intended use is diff --git a/mm/bounce.c b/mm/bounce.c index 55f512af50c7..2ee1b6fef44a 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) /* * free up bounce indirect pages used */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment_all(bvec, bio, i) { org_vec = bio_orig->bi_io_vec + i; if (bvec->bv_page == org_vec->bv_page) continue; -- cgit From a07876064a0b73ab5ef1ebcf14b1cf0231c07858 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 10 Sep 2012 14:03:28 -0700 Subject: block: Add bio_alloc_pages() More utility code to replace stuff that's getting open coded. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: NeilBrown --- drivers/md/raid1.c | 16 +++------------- fs/bio.c | 28 ++++++++++++++++++++++++++++ include/linux/bio.h | 1 + 3 files changed, 32 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a7ea954abe1d..aeb4e3f74791 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { struct pool_info *pi = data; - struct page *page; struct r1bio *r1_bio; struct bio *bio; int i, j; @@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) j = 1; while(j--) { bio = r1_bio->bios[j]; - for (i = 0; i < RESYNC_PAGES; i++) { - page = alloc_page(gfp_flags); - if (unlikely(!page)) - goto out_free_pages; + bio->bi_vcnt = RESYNC_PAGES; - bio->bi_io_vec[i].bv_page = page; - bio->bi_vcnt = i+1; - } + if (bio_alloc_pages(bio, gfp_flags)) + goto out_free_bio; } /* If not user-requests, copy the page pointers to all bios */ if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { @@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; -out_free_pages: - for (j=0 ; j < pi->raid_disks; j++) - for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++) - put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); - j = -1; out_free_bio: while (++j < pi->raid_disks) bio_put(r1_bio->bios[j]); diff --git a/fs/bio.c b/fs/bio.c index fe3aee90c988..e545a440d376 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -829,6 +829,34 @@ void bio_advance(struct bio *bio, unsigned bytes) } EXPORT_SYMBOL(bio_advance); +/** + * bio_alloc_pages - allocates a single page for each bvec in a bio + * @bio: bio to allocate pages for + * @gfp_mask: flags for allocation + * + * Allocates pages up to @bio->bi_vcnt. + * + * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are + * freed. 
+ */
+int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp_mask);
+ if (!bv->bv_page) {
+ while (--bv >= bio->bi_io_vec)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(bio_alloc_pages);
+ /** * bio_copy_data - copy contents of data buffers from one chain of bios to * another diff --git a/include/linux/bio.h b/include/linux/bio.h index be2efa09f9bf..e25378f2f408 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -298,6 +298,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi) #endif extern void bio_copy_data(struct bio *dst, struct bio *src); +extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); -- cgit From a38352e0ac02dbbd4fa464dc22d1352b5fbd06fd Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 25 May 2012 13:03:11 -0700 Subject: block: Add an explicit bio flag for bios that own their bvec This is for the new bio splitting code. When we split a bio, if the split occurred on a bvec boundary we reuse the bvec for the new bio. But that means bio_free() can't free it, hence the explicit flag. Signed-off-by: Kent Overstreet CC: Jens Axboe Acked-by: Tejun Heo --- fs/bio.c | 4 +++- include/linux/bio.h | 5 ----- include/linux/blk_types.h | 1 + 3 files changed, 4 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/fs/bio.c b/fs/bio.c index e545a440d376..9238a54b562c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -252,7 +252,7 @@ static void bio_free(struct bio *bio) __bio_free(bio); if (bs) { - if (bio_has_allocated_vec(bio)) + if (bio_flagged(bio, BIO_OWNS_VEC)) bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio)); /* @@ -451,6 +451,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) if (unlikely(!bvl)) goto err_free; + + bio->bi_flags |= 1 << BIO_OWNS_VEC; } else if (nr_iovecs) { bvl = bio->bi_inline_vecs; } diff --git a/include/linux/bio.h b/include/linux/bio.h index e25378f2f408..794bcd0c5039 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -85,11 +85,6 @@ static inline void *bio_data(struct bio *bio) return NULL; } -static inline int bio_has_allocated_vec(struct bio *bio) -{ - return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; -} - /* * will die */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c178d25e588b..538289ffc704 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -117,6 +117,7 @@ struct bio { * BIO_POOL_IDX() */ #define BIO_RESET_BITS 12 +#define BIO_OWNS_VEC 12 /* bio_free() should free bvec */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) -- cgit From 29ed7813ce5c4661261aeebddb1b8660e0860223 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 4 Sep 2012 09:54:22 -0700 Subject: bio-integrity: Add explicit field for owner of bip_buf This was the only real user of BIO_CLONED, which didn't have very clear semantics. Convert to its own flag so we can get rid of BIO_CLONED. Signed-off-by: Kent Overstreet CC: Jens Axboe CC: Martin K. 
Petersen --- fs/bio-integrity.c | 5 ++--- include/linux/bio.h | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index ca7b02dbf09d..8fb42916d8a2 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -97,9 +97,7 @@ void bio_integrity_free(struct bio *bio) struct bio_integrity_payload *bip = bio->bi_integrity; struct bio_set *bs = bio->bi_pool; - /* A cloned bio doesn't own the integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) - && bip->bip_buf != NULL) + if (bip->bip_owns_buf) kfree(bip->bip_buf); if (bs) { @@ -386,6 +384,7 @@ int bio_integrity_prep(struct bio *bio) return -EIO; } + bip->bip_owns_buf = 1; + bip->bip_buf = buf; + bip->bip_size = len; + bip->bip_sector = bio->bi_sector; diff --git a/include/linux/bio.h b/include/linux/bio.h index 794bcd0c5039..ef24466d8f82 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -187,6 +187,7 @@ struct bio_integrity_payload { unsigned short bip_slab; /* slab the bip came from */ unsigned short bip_vcnt; /* # of integrity bio_vecs */ unsigned short bip_idx; /* current bip_vec index */ + unsigned bip_owns_buf:1; /* should free bip_buf */ struct work_struct bip_work; /* I/O completion */ -- cgit From 84759c6d18c5144432781ddca037d929ee9db8a5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Wed, 21 Sep 2011 21:43:05 -0700 Subject: Revert "rw_semaphore: remove up/down_read_non_owner" This reverts commit 11b80f459adaf91a712f95e7734a17655a36bf30. Bcache needs rw semaphores for cache coherency in writeback mode - writes have to take a read lock on a per cache device rw sem, and release it when the bio completes. But since this is for bios it's naturally not in the context of the process that originally took the lock. Signed-off-by: Kent Overstreet CC: Christoph Hellwig CC: David Howells --- include/linux/rwsem.h | 10 ++++++++++ kernel/rwsem.c | 16 ++++++++++++++++ 2 files changed, 26 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 8da67d625e13..0616ffe45702 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -133,10 +133,20 @@ do { \ _down_write_nest_lock(sem, &(nest_lock)->dep_map); \ } while (0); +/* + * Take/release a lock when the owner will not be the one to release it. + * + * [ This API should be avoided as much as possible - the + * proper abstraction for this case is completions. 
] + */ +extern void down_read_non_owner(struct rw_semaphore *sem); +extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) +# define down_read_non_owner(sem) down_read(sem) +# define up_read_non_owner(sem) up_read(sem) #endif #endif /* _LINUX_RWSEM_H */ diff --git a/kernel/rwsem.c b/kernel/rwsem.c index b3c6c3fcd847..cfff1435bdfb 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c @@ -126,6 +126,15 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) EXPORT_SYMBOL(_down_write_nest_lock); +void down_read_non_owner(struct rw_semaphore *sem) +{ + might_sleep(); + + __down_read(sem); +} + +EXPORT_SYMBOL(down_read_non_owner); + void down_write_nested(struct rw_semaphore *sem, int subclass) { might_sleep(); @@ -136,6 +145,13 @@ void down_write_nested(struct rw_semaphore *sem, int subclass) EXPORT_SYMBOL(down_write_nested); +void up_read_non_owner(struct rw_semaphore *sem) +{ + __up_read(sem); +} + +EXPORT_SYMBOL(up_read_non_owner); + #endif -- cgit From cafe563591446cf80bfbc2fe3bc72a2e36cf1060 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 23 Mar 2013 16:11:31 -0700 Subject: bcache: A block layer cache Does writethrough and writeback caching, handles unclean shutdown, and has a bunch of other nifty features motivated by real world usage. See the wiki at http://bcache.evilpiepirate.org for more. Signed-off-by: Kent Overstreet --- Documentation/ABI/testing/sysfs-block-bcache | 156 ++ Documentation/bcache.txt | 343 ++++ MAINTAINERS | 7 + drivers/md/Kconfig | 2 + drivers/md/Makefile | 1 + drivers/md/bcache/Kconfig | 42 + drivers/md/bcache/Makefile | 7 + drivers/md/bcache/alloc.c | 583 ++++++ drivers/md/bcache/bcache.h | 1232 +++++++++++++ drivers/md/bcache/bset.c | 1190 ++++++++++++ drivers/md/bcache/bset.h | 379 ++++ drivers/md/bcache/btree.c | 2503 ++++++++++++++++++++++++++ drivers/md/bcache/btree.h | 405 +++++ drivers/md/bcache/closure.c | 348 ++++ drivers/md/bcache/closure.h | 670 +++++++ drivers/md/bcache/debug.c | 563 ++++++ drivers/md/bcache/debug.h | 54 + drivers/md/bcache/io.c | 390 ++++ drivers/md/bcache/journal.c | 785 ++++++++ drivers/md/bcache/journal.h | 215 +++ drivers/md/bcache/movinggc.c | 254 +++ drivers/md/bcache/request.c | 1409 +++++++++++++++ drivers/md/bcache/request.h | 62 + drivers/md/bcache/stats.c | 245 +++ drivers/md/bcache/stats.h | 58 + drivers/md/bcache/super.c | 1941 ++++++++++++++++++++ drivers/md/bcache/sysfs.c | 817 +++++++++ drivers/md/bcache/sysfs.h | 110 ++ drivers/md/bcache/trace.c | 26 + drivers/md/bcache/util.c | 389 ++++ drivers/md/bcache/util.h | 589 ++++++ drivers/md/bcache/writeback.c | 414 +++++ include/linux/cgroup_subsys.h | 6 + include/linux/sched.h | 4 + include/trace/events/bcache.h | 271 +++ kernel/fork.c | 4 + 36 files changed, 16474 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-block-bcache create mode 100644 Documentation/bcache.txt create mode 100644 drivers/md/bcache/Kconfig create mode 100644 drivers/md/bcache/Makefile create mode 100644 drivers/md/bcache/alloc.c create mode 100644 drivers/md/bcache/bcache.h create mode 100644 drivers/md/bcache/bset.c create mode 100644 drivers/md/bcache/bset.h create mode 100644 drivers/md/bcache/btree.c create mode 100644 drivers/md/bcache/btree.h create mode 100644 drivers/md/bcache/closure.c create mode 100644 drivers/md/bcache/closure.h create mode 100644 
drivers/md/bcache/debug.c create mode 100644 drivers/md/bcache/debug.h create mode 100644 drivers/md/bcache/io.c create mode 100644 drivers/md/bcache/journal.c create mode 100644 drivers/md/bcache/journal.h create mode 100644 drivers/md/bcache/movinggc.c create mode 100644 drivers/md/bcache/request.c create mode 100644 drivers/md/bcache/request.h create mode 100644 drivers/md/bcache/stats.c create mode 100644 drivers/md/bcache/stats.h create mode 100644 drivers/md/bcache/super.c create mode 100644 drivers/md/bcache/sysfs.c create mode 100644 drivers/md/bcache/sysfs.h create mode 100644 drivers/md/bcache/trace.c create mode 100644 drivers/md/bcache/util.c create mode 100644 drivers/md/bcache/util.h create mode 100644 drivers/md/bcache/writeback.c create mode 100644 include/trace/events/bcache.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-block-bcache b/Documentation/ABI/testing/sysfs-block-bcache new file mode 100644 index 000000000000..9e4bbc5d51fd --- /dev/null +++ b/Documentation/ABI/testing/sysfs-block-bcache @@ -0,0 +1,156 @@
+What: /sys/block/<disk>/bcache/unregister +Date: November 2010 +Contact: Kent Overstreet +Description: + A write to this file causes the backing device or cache to be + unregistered. If a backing device had dirty data in the cache, + writeback mode is automatically disabled and all dirty data is + flushed before the device is unregistered. Caches unregister + all associated backing devices before unregistering themselves. +
+What: /sys/block/<disk>/bcache/clear_stats +Date: November 2010 +Contact: Kent Overstreet +Description: + Writing to this file resets all the statistics for the device. +
+What: /sys/block/<disk>/bcache/cache +Date: November 2010 +Contact: Kent Overstreet +Description: + For a backing device that has cache, a symlink to + the bcache/ dir of that cache. +
+What: /sys/block/<disk>/bcache/cache_hits +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: integer number of full cache hits, + counted per bio. A partial cache hit counts as a miss. +
+What: /sys/block/<disk>/bcache/cache_misses +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: integer number of cache misses. +
+What: /sys/block/<disk>/bcache/cache_hit_ratio +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: cache hits as a percentage. +
+What: /sys/block/<disk>/bcache/sequential_cutoff +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: Threshold past which sequential IO will + skip the cache. Read and written as bytes in human readable + units (e.g. echo 10M > sequential_cutoff). +
+What: /sys/block/<disk>/bcache/bypassed +Date: November 2010 +Contact: Kent Overstreet +Description: + Sum of all reads and writes that have bypassed the cache (due + to the sequential cutoff). Expressed as bytes in human + readable units. +
+What: /sys/block/<disk>/bcache/writeback +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: When on, writeback caching is enabled and + writes will be buffered in the cache. When off, caching is in + writethrough mode; reads and writes will be added to the + cache but no write buffering will take place. +
+What: /sys/block/<disk>/bcache/writeback_running +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: when off, dirty data will not be written + from the cache to the backing device. 
The cache will still be + used to buffer writes until it is mostly full, at which point + writes transparently revert to writethrough mode. Intended only + for benchmarking/testing. +
+What: /sys/block/<disk>/bcache/writeback_delay +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: In writeback mode, when dirty data is + written to the cache and the cache held no dirty data for that + backing device, writeback from cache to backing device starts + after this delay, expressed as an integer number of seconds. +
+What: /sys/block/<disk>/bcache/writeback_percent +Date: November 2010 +Contact: Kent Overstreet +Description: + For backing devices: If nonzero, writeback from cache to + backing device only takes place when more than this percentage + of the cache is used, allowing more write coalescing to take + place and reducing total number of writes sent to the backing + device. Integer between 0 and 40. +
+What: /sys/block/<disk>/bcache/synchronous +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, a boolean that allows synchronous mode to be + switched on and off. In synchronous mode all writes are ordered + such that the cache can reliably recover from unclean shutdown; + if disabled bcache will not generally wait for writes to + complete but if the cache is not shut down cleanly all data + will be discarded from the cache. Should not be turned off with + writeback caching enabled. +
+What: /sys/block/<disk>/bcache/discard +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, a boolean allowing discard/TRIM to be turned off + or back on if the device supports it. +
+What: /sys/block/<disk>/bcache/bucket_size +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, bucket size in human readable units, as set at + cache creation time; should match the erase block size of the + SSD for optimal performance. +
+What: /sys/block/<disk>/bcache/nbuckets +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, the number of usable buckets. +
+What: /sys/block/<disk>/bcache/tree_depth +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, height of the btree excluding leaf nodes (i.e. a + one node tree will have a depth of 0). +
+What: /sys/block/<disk>/bcache/btree_cache_size +Date: November 2010 +Contact: Kent Overstreet +Description: + Number of btree buckets/nodes that are currently cached in + memory; cache dynamically grows and shrinks in response to + memory pressure from the rest of the system. +
+What: /sys/block/<disk>/bcache/written +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, total amount of data in human readable units + written to the cache, excluding all metadata. +
+What: /sys/block/<disk>/bcache/btree_written +Date: November 2010 +Contact: Kent Overstreet +Description: + For a cache, sum of all btree writes in human readable units. diff --git a/Documentation/bcache.txt b/Documentation/bcache.txt new file mode 100644 index 000000000000..533307d52c87 --- /dev/null +++ b/Documentation/bcache.txt @@ -0,0 +1,343 @@ +Say you've got a big slow raid 6, and an X-25E or three. Wouldn't it be +nice if you could use them as cache... Hence bcache. 
+ +Wiki and git repositories are at: + http://bcache.evilpiepirate.org + http://evilpiepirate.org/git/linux-bcache.git + http://evilpiepirate.org/git/bcache-tools.git +
+It's designed around the performance characteristics of SSDs - it only allocates +in erase block sized buckets, and it uses a hybrid btree/log to track cached +extents (which can be anywhere from a single sector to the bucket size). It's +designed to avoid random writes at all costs; it fills up an erase block +sequentially, then issues a discard before reusing it. +
+Both writethrough and writeback caching are supported. Writeback defaults to +off, but can be switched on and off arbitrarily at runtime. Bcache goes to +great lengths to protect your data - it reliably handles unclean shutdown. (It +doesn't even have a notion of a clean shutdown; bcache simply doesn't return +writes as completed until they're on stable storage). +
+Writeback caching can use most of the cache for buffering writes - writing +dirty data to the backing device is always done sequentially, scanning from the +start to the end of the index. +
+Since random IO is what SSDs excel at, there generally won't be much benefit +to caching large sequential IO. Bcache detects sequential IO and skips it; +it also keeps a rolling average of the IO sizes per task, and as long as the +average is above the cutoff it will skip all IO from that task - instead of +caching the first 512k after every seek. Backups and large file copies should +thus entirely bypass the cache. +
+In the event of a data IO error on the flash it will try to recover by reading +from disk or invalidating cache entries. For unrecoverable errors (metadata +or dirty data), caching is automatically disabled; if dirty data was present +in the cache it first disables writeback caching and waits for all dirty data +to be flushed. +
+Getting started: +You'll need make-bcache from the bcache-tools repository. Both the cache device +and backing device must be formatted before use. + make-bcache -B /dev/sdb + make-bcache -C /dev/sdc +
+make-bcache has the ability to format multiple devices at the same time - if +you format your backing devices and cache device at the same time, you won't +have to manually attach: + make-bcache -B /dev/sda /dev/sdb -C /dev/sdc +
+To make bcache devices known to the kernel, echo them to /sys/fs/bcache/register: +
+ echo /dev/sdb > /sys/fs/bcache/register + echo /dev/sdc > /sys/fs/bcache/register +
+To register your bcache devices automatically, you could add something like +this to an init script: +
+ echo /dev/sd* > /sys/fs/bcache/register_quiet +
+It'll look for bcache superblocks and ignore everything that doesn't have one. +
+Registering the backing device makes the bcache show up in /dev; you can now +format it and use it as normal. But the first time using a new bcache device, +it'll be running in passthrough mode until you attach it to a cache. See the +section on attaching. +
+The devices show up at /dev/bcacheN, and can be controlled via sysfs from +/sys/block/bcacheN/bcache: +
+ mkfs.ext4 /dev/bcache0 + mount /dev/bcache0 /mnt +
+Cache devices are managed as sets; multiple caches per set isn't supported yet +but will allow for mirroring of metadata and dirty data in the future. Your new +cache set shows up as /sys/fs/bcache/<UUID> +
+ATTACHING: +
+After your cache device and backing device are registered, the backing device +must be attached to your cache set to enable caching. 
Attaching a backing + device to a cache set is done thusly, with the UUID of the cache set in +/sys/fs/bcache: +
+ echo <CSET-UUID> > /sys/block/bcache0/bcache/attach +
+This only has to be done once. The next time you reboot, just reregister all +your bcache devices. If a backing device has data in a cache somewhere, the +/dev/bcache# device won't be created until the cache shows up - particularly +important if you have writeback caching turned on. +
+If you're booting up and your cache device is gone and never coming back, you +can force run the backing device: +
+ echo 1 > /sys/block/sdb/bcache/running +
+(You need to use /sys/block/sdb (or whatever your backing device is called), not +/sys/block/bcache0, because bcache0 doesn't exist yet. If you're using a +partition, the bcache directory would be at /sys/block/sdb/sdb2/bcache) +
+The backing device will still use that cache set if it shows up in the future, +but all the cached data will be invalidated. If there was dirty data in the +cache, don't expect the filesystem to be recoverable - you will have massive +filesystem corruption, though ext4's fsck does work miracles. +
+SYSFS - BACKING DEVICE: +
+attach + Echo the UUID of a cache set to this file to enable caching. +
+cache_mode + Can be one of either writethrough, writeback, writearound or none. +
+clear_stats + Writing to this file resets the running total stats (not the day/hour/5 minute + decaying versions). +
+detach + Write to this file to detach from a cache set. If there is dirty data in the + cache, it will be flushed first. +
+dirty_data + Amount of dirty data for this backing device in the cache. Continuously + updated unlike the cache set's version, but may be slightly off. +
+label + Name of underlying device. +
+readahead + Size of readahead that should be performed. Defaults to 0. If set to e.g. + 1M, it will round cache miss reads up to that size, but without overlapping + existing cache entries. +
+running + 1 if bcache is running (i.e. whether the /dev/bcache device exists, whether + it's in passthrough mode or caching). +
+sequential_cutoff + A sequential IO will bypass the cache once it passes this threshold; the + most recent 128 IOs are tracked so sequential IO can be detected even when + it isn't all done at once. +
+sequential_merge + If nonzero, bcache keeps a list of the last 128 requests submitted to compare + against all new requests to determine which new requests are sequential + continuations of previous requests for the purpose of determining sequential + cutoff. This is necessary if the sequential cutoff value is greater than the + maximum acceptable sequential size for any single request. +
+state + The backing device can be in one of four different states: +
+ no cache: Has never been attached to a cache set. +
+ clean: Part of a cache set, and there is no cached dirty data. +
+ dirty: Part of a cache set, and there is cached dirty data. +
+ inconsistent: The backing device was forcibly run by the user when there was + dirty data cached but the cache set was unavailable; whatever data was on the + backing device has likely been corrupted. +
+stop + Write to this file to shut down the bcache device and close the backing + device. +
+writeback_delay + When dirty data is written to the cache and it previously did not contain + any, waits some number of seconds before initiating writeback. Defaults to + 30. 
+ +writeback_percent + If nonzero, bcache tries to keep around this percentage of the cache dirty by + throttling background writeback and using a PD controller to smoothly adjust + the rate. +
+writeback_rate + Rate in sectors per second - if writeback_percent is nonzero, background + writeback is throttled to this rate. Continuously adjusted by bcache but may + also be set by the user. +
+writeback_running + If off, writeback of dirty data will not take place at all. Dirty data will + still be added to the cache until it is mostly full; only meant for + benchmarking. Defaults to on. +
+SYSFS - BACKING DEVICE STATS: +
+There are directories with these numbers for a running total, as well as +versions that decay over the past day, hour and 5 minutes; they're +aggregated in the cache set directory as well. +
+bypassed + Amount of IO (both reads and writes) that has bypassed the cache +
+cache_hits +cache_misses +cache_hit_ratio + Hits and misses are counted per individual IO as bcache sees them; a + partial hit is counted as a miss. +
+cache_bypass_hits +cache_bypass_misses + Hits and misses for IO that is intended to skip the cache are still counted, + but broken out here. +
+cache_miss_collisions + Counts instances where data was going to be inserted into the cache from a + cache miss, but raced with a write and data was already present (usually 0 + since the synchronization for cache misses was rewritten) +
+cache_readaheads + Count of times readahead occurred. +
+SYSFS - CACHE SET: +
+average_key_size + Average data per key in the btree. +
+bdev<0..n> + Symlink to each of the attached backing devices. +
+block_size + Block size of the cache devices. +
+btree_cache_size + Amount of memory currently used by the btree cache +
+bucket_size + Size of buckets +
+cache<0..n> + Symlink to each of the cache devices comprising this cache set. +
+cache_available_percent + Percentage of cache device free. +
+clear_stats + Clears the statistics associated with this cache +
+dirty_data + Amount of dirty data in the cache (updated when garbage collection runs). +
+flash_vol_create + Echoing a size to this file (in human readable units, k/M/G) creates a thinly + provisioned volume backed by the cache set. +
+io_error_halflife +io_error_limit + These determine how many errors we accept before disabling the cache. + Each error is decayed by the half life (in # ios). If the decaying count + reaches io_error_limit dirty data is written out and the cache is disabled. +
+journal_delay_ms + Journal writes will delay for up to this many milliseconds, unless a cache + flush happens sooner. Defaults to 100. +
+root_usage_percent + Percentage of the root btree node in use. If this gets too high the node + will split, increasing the tree depth. +
+stop + Write to this file to shut down the cache set - waits until all attached + backing devices have been shut down. +
+tree_depth + Depth of the btree (a single node btree has depth 0). +
+unregister + Detaches all backing devices and closes the cache devices; if dirty data is + present it will disable writeback caching and wait for it to be flushed. +
+SYSFS - CACHE SET INTERNAL: +
+This directory also exposes timings for a number of internal operations, with +separate files for average duration, average frequency, last occurrence and max +duration: garbage collection, btree read, btree node sorts and btree splits. +
+active_journal_entries + Number of journal entries that are newer than the index. +
+btree_nodes + Total nodes in the btree. 
+ +btree_used_percent + Average fraction of btree in use. +
+bset_tree_stats + Statistics about the auxiliary search trees +
+btree_cache_max_chain + Longest chain in the btree node cache's hash table +
+cache_read_races + Counts instances where while data was being read from the cache, the bucket + was reused and invalidated - i.e. where the pointer was stale after the read + completed. When this occurs the data is reread from the backing device. +
+trigger_gc + Writing to this file forces garbage collection to run. +
+SYSFS - CACHE DEVICE: +
+block_size + Minimum granularity of writes - should match hardware sector size. +
+btree_written + Sum of all btree writes, in (kilo/mega/giga) bytes +
+bucket_size + Size of buckets +
+cache_replacement_policy + One of either lru, fifo or random. +
+discard + Boolean; if on a discard/TRIM will be issued to each bucket before it is + reused. Defaults to off, since SATA TRIM is an unqueued command (and thus + slow). +
+freelist_percent + Size of the freelist as a percentage of nbuckets. Can be written to in order + to increase the number of buckets kept on the freelist, which lets you + artificially reduce the size of the cache at runtime. Mostly for testing + purposes (i.e. testing how different size caches affect your hit rate), but + since buckets are discarded when they move on to the freelist, it will also + make the SSD's garbage collection easier by effectively giving it more + reserved space. +
+io_errors + Number of errors that have occurred, decayed by io_error_halflife. +
+metadata_written + Sum of all non-data writes (btree writes and all other metadata). +
+nbuckets + Total buckets in this cache +
+priority_stats + Statistics about how recently data in the cache has been accessed. This can + reveal your working set size. +
+written + Sum of all data that has been written to the cache; comparison with + btree_written gives the amount of write inflation in bcache. diff --git a/MAINTAINERS b/MAINTAINERS index 50b4d735f961..64b849620b52 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1616,6 +1616,13 @@ W: http://www.baycom.org/~tom/ham/ham.html S: Maintained F: drivers/net/hamradio/baycom* +BCACHE (BLOCK LAYER CACHE) +M: Kent Overstreet +L: linux-bcache@vger.kernel.org +W: http://bcache.evilpiepirate.org +S: Maintained +F: drivers/md/bcache/ + BEFS FILE SYSTEM S: Orphan F: Documentation/filesystems/befs.txt diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 4d8d90b4fe78..3bfc8f1da9fe 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -174,6 +174,8 @@ config MD_FAULTY If unsure, say N. +source "drivers/md/bcache/Kconfig" + config BLK_DEV_DM tristate "Device mapper support" ---help--- diff --git a/drivers/md/Makefile b/drivers/md/Makefile index 7ceeaefc0e95..1439fd4ad9b1 100644 --- a/drivers/md/Makefile +++ b/drivers/md/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_MD_RAID10) += raid10.o obj-$(CONFIG_MD_RAID456) += raid456.o obj-$(CONFIG_MD_MULTIPATH) += multipath.o obj-$(CONFIG_MD_FAULTY) += faulty.o +obj-$(CONFIG_BCACHE) += bcache/ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o obj-$(CONFIG_DM_BUFIO) += dm-bufio.o diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig new file mode 100644 index 000000000000..05c220d05e23 --- /dev/null +++ b/drivers/md/bcache/Kconfig @@ -0,0 +1,42 @@ + +config BCACHE + tristate "Block device as cache" + select CLOSURES + ---help--- + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. 
+ + See Documentation/bcache.txt for details. +
+config BCACHE_DEBUG + bool "Bcache debugging" + depends on BCACHE + ---help--- + Don't select this option unless you're a developer +
+ Enables extra debugging tools (primarily a fuzz tester) +
+config BCACHE_EDEBUG + bool "Extended runtime checks" + depends on BCACHE + ---help--- + Don't select this option unless you're a developer +
+ Enables extra runtime checks which significantly affect performance +
+config BCACHE_CLOSURES_DEBUG + bool "Debug closures" + depends on BCACHE + select DEBUG_FS + ---help--- + Keeps all active closures in a linked list and provides a debugfs + interface to list them, which makes it possible to see asynchronous + operations that get stuck. +
+# cgroup code needs to be updated: +# +#config CGROUP_BCACHE +# bool "Cgroup controls for bcache" +# depends on BCACHE && BLK_CGROUP +# ---help--- +# TODO diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile new file mode 100644 index 000000000000..0e9c82523be6 --- /dev/null +++ b/drivers/md/bcache/Makefile @@ -0,0 +1,7 @@ + +obj-$(CONFIG_BCACHE) += bcache.o + +bcache-y := alloc.o btree.o bset.o io.o journal.o writeback.o\ + movinggc.o request.o super.o sysfs.o debug.o util.o trace.o stats.o closure.o + +CFLAGS_request.o += -Iblock diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c new file mode 100644 index 000000000000..ed18115e078e --- /dev/null +++ b/drivers/md/bcache/alloc.c @@ -0,0 +1,583 @@ +/* + * Primary bucket allocation code + * + * Copyright 2012 Google, Inc. + * + * Allocation in bcache is done in terms of buckets: + * + * Each bucket has an associated 8 bit gen; this gen corresponds to the gen in + * btree pointers - they must match for the pointer to be considered valid. + * + * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a + * bucket simply by incrementing its gen. + * + * The gens (along with the priorities; it's really the gens that are important + * but the code is named as if it's the priorities) are written in an arbitrary + * list of buckets on disk, with a pointer to them in the journal header. + * + * When we invalidate a bucket, we have to write its new gen to disk and wait + * for that write to complete before we use it - otherwise after a crash we + * could have pointers that appeared to be good but pointed to data that had + * been overwritten. + * + * Since the gens and priorities are all stored contiguously on disk, we can + * batch this up: We fill up the free_inc list with freshly invalidated buckets, + * call prio_write(), and when prio_write() finishes we pull buckets off the + * free_inc list and optionally discard them. + * + * free_inc isn't the only freelist - if it was, we'd often have to sleep while + * priorities and gens were being written before we could allocate. c->free is a + * smaller freelist, and buckets on that list are always ready to be used. + * + * If we've got discards enabled, that happens when a bucket moves from the + * free_inc list to the free list. + * + * There is another freelist, because sometimes we have buckets that we know + * have nothing pointing into them - these we can reuse without waiting for + * priorities to be rewritten. These come from freed btree nodes and buckets + * that garbage collection discovered no longer had valid keys pointing into + * them (because they were overwritten). That's the unused list - buckets on the + * unused list move to the free list, optionally being discarded in the process. 
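+ *
+ * Putting that together, a bucket being reused flows through the freelists
+ * roughly like this (a simplified sketch of the lists described above, not
+ * additional machinery):
+ *
+ *   invalidate_buckets() -> free_inc -> prio_write() -> free -> allocated
+ *   freed btree nodes / GC -> unused --------------------^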
+ * + * It's also important to ensure that gens don't wrap around - with respect to + * either the oldest gen in the btree or the gen on disk. This is quite + * difficult to do in practice, but we explicitly guard against it anyways - if + * a bucket is in danger of wrapping around we simply skip invalidating it that + * time around, and we garbage collect or rewrite the priorities sooner than we + * would have otherwise. + * + * bch_bucket_alloc() allocates a single bucket from a specific cache. + * + * bch_bucket_alloc_set() allocates one or more buckets from different caches + * out of a cache set. + * + * free_some_buckets() drives all the processes described above. It's called + * from bch_bucket_alloc() and a few other places that need to make sure free + * buckets are ready. + * + * invalidate_buckets_(lru|fifo)() find buckets that are available to be + * invalidated, and then invalidate them and stick them on the free_inc list - + * in either lru or fifo order. + */ + +#include "bcache.h" +#include "btree.h" + +#include + +#define MAX_IN_FLIGHT_DISCARDS 8U + +/* Bucket heap / gen */ + +uint8_t bch_inc_gen(struct cache *ca, struct bucket *b) +{ + uint8_t ret = ++b->gen; + + ca->set->need_gc = max(ca->set->need_gc, bucket_gc_gen(b)); + WARN_ON_ONCE(ca->set->need_gc > BUCKET_GC_GEN_MAX); + + if (CACHE_SYNC(&ca->set->sb)) { + ca->need_save_prio = max(ca->need_save_prio, + bucket_disk_gen(b)); + WARN_ON_ONCE(ca->need_save_prio > BUCKET_DISK_GEN_MAX); + } + + return ret; +} + +void bch_rescale_priorities(struct cache_set *c, int sectors) +{ + struct cache *ca; + struct bucket *b; + unsigned next = c->nbuckets * c->sb.bucket_size / 1024; + unsigned i; + int r; + + atomic_sub(sectors, &c->rescale); + + do { + r = atomic_read(&c->rescale); + + if (r >= 0) + return; + } while (atomic_cmpxchg(&c->rescale, r, r + next) != r); + + mutex_lock(&c->bucket_lock); + + c->min_prio = USHRT_MAX; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) + if (b->prio && + b->prio != BTREE_PRIO && + !atomic_read(&b->pin)) { + b->prio--; + c->min_prio = min(c->min_prio, b->prio); + } + + mutex_unlock(&c->bucket_lock); +} + +/* Discard/TRIM */ + +struct discard { + struct list_head list; + struct work_struct work; + struct cache *ca; + long bucket; + + struct bio bio; + struct bio_vec bv; +}; + +static void discard_finish(struct work_struct *w) +{ + struct discard *d = container_of(w, struct discard, work); + struct cache *ca = d->ca; + char buf[BDEVNAME_SIZE]; + + if (!test_bit(BIO_UPTODATE, &d->bio.bi_flags)) { + pr_notice("discard error on %s, disabling", + bdevname(ca->bdev, buf)); + d->ca->discard = 0; + } + + mutex_lock(&ca->set->bucket_lock); + + fifo_push(&ca->free, d->bucket); + list_add(&d->list, &ca->discards); + atomic_dec(&ca->discards_in_flight); + + mutex_unlock(&ca->set->bucket_lock); + + closure_wake_up(&ca->set->bucket_wait); + wake_up(&ca->set->alloc_wait); + + closure_put(&ca->set->cl); +} + +static void discard_endio(struct bio *bio, int error) +{ + struct discard *d = container_of(bio, struct discard, bio); + schedule_work(&d->work); +} + +static void do_discard(struct cache *ca, long bucket) +{ + struct discard *d = list_first_entry(&ca->discards, + struct discard, list); + + list_del(&d->list); + d->bucket = bucket; + + atomic_inc(&ca->discards_in_flight); + closure_get(&ca->set->cl); + + bio_init(&d->bio); + + d->bio.bi_sector = bucket_to_sector(ca->set, d->bucket); + d->bio.bi_bdev = ca->bdev; + d->bio.bi_rw = REQ_WRITE|REQ_DISCARD; + d->bio.bi_max_vecs = 1; + d->bio.bi_io_vec = 
d->bio.bi_inline_vecs; + d->bio.bi_size = bucket_bytes(ca); + d->bio.bi_end_io = discard_endio; + bio_set_prio(&d->bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + submit_bio(0, &d->bio); +} + +/* Allocation */ + +static inline bool can_inc_bucket_gen(struct bucket *b) +{ + return bucket_gc_gen(b) < BUCKET_GC_GEN_MAX && + bucket_disk_gen(b) < BUCKET_DISK_GEN_MAX; +} + +bool bch_bucket_add_unused(struct cache *ca, struct bucket *b) +{ + BUG_ON(GC_MARK(b) || GC_SECTORS_USED(b)); + + if (fifo_used(&ca->free) > ca->watermark[WATERMARK_MOVINGGC] && + CACHE_REPLACEMENT(&ca->sb) == CACHE_REPLACEMENT_FIFO) + return false; + + b->prio = 0; + + if (can_inc_bucket_gen(b) && + fifo_push(&ca->unused, b - ca->buckets)) { + atomic_inc(&b->pin); + return true; + } + + return false; +} + +static bool can_invalidate_bucket(struct cache *ca, struct bucket *b) +{ + return GC_MARK(b) == GC_MARK_RECLAIMABLE && + !atomic_read(&b->pin) && + can_inc_bucket_gen(b); +} + +static void invalidate_one_bucket(struct cache *ca, struct bucket *b) +{ + bch_inc_gen(ca, b); + b->prio = INITIAL_PRIO; + atomic_inc(&b->pin); + fifo_push(&ca->free_inc, b - ca->buckets); +} + +static void invalidate_buckets_lru(struct cache *ca) +{ + unsigned bucket_prio(struct bucket *b) + { + return ((unsigned) (b->prio - ca->set->min_prio)) * + GC_SECTORS_USED(b); + } + + bool bucket_max_cmp(struct bucket *l, struct bucket *r) + { + return bucket_prio(l) < bucket_prio(r); + } + + bool bucket_min_cmp(struct bucket *l, struct bucket *r) + { + return bucket_prio(l) > bucket_prio(r); + } + + struct bucket *b; + ssize_t i; + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (!can_invalidate_bucket(ca, b)) + continue; + + if (!GC_SECTORS_USED(b)) { + if (!bch_bucket_add_unused(ca, b)) + return; + } else { + if (!heap_full(&ca->heap)) + heap_add(&ca->heap, b, bucket_max_cmp); + else if (bucket_max_cmp(b, heap_peek(&ca->heap))) { + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_max_cmp); + } + } + } + + if (ca->heap.used * 2 < ca->heap.size) + bch_queue_gc(ca->set); + + for (i = ca->heap.used / 2 - 1; i >= 0; --i) + heap_sift(&ca->heap, i, bucket_min_cmp); + + while (!fifo_full(&ca->free_inc)) { + if (!heap_pop(&ca->heap, b, bucket_min_cmp)) { + /* We don't want to be calling invalidate_buckets() + * multiple times when it can't do anything + */ + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + + invalidate_one_bucket(ca, b); + } +} + +static void invalidate_buckets_fifo(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + if (ca->fifo_last_bucket < ca->sb.first_bucket || + ca->fifo_last_bucket >= ca->sb.nbuckets) + ca->fifo_last_bucket = ca->sb.first_bucket; + + b = ca->buckets + ca->fifo_last_bucket++; + + if (can_invalidate_bucket(ca, b)) + invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets) { + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets_random(struct cache *ca) +{ + struct bucket *b; + size_t checked = 0; + + while (!fifo_full(&ca->free_inc)) { + size_t n; + get_random_bytes(&n, sizeof(n)); + + n %= (size_t) (ca->sb.nbuckets - ca->sb.first_bucket); + n += ca->sb.first_bucket; + + b = ca->buckets + n; + + if (can_invalidate_bucket(ca, b)) + invalidate_one_bucket(ca, b); + + if (++checked >= ca->sb.nbuckets / 2) { + ca->invalidate_needs_gc = 1; + bch_queue_gc(ca->set); + return; + } + } +} + +static void invalidate_buckets(struct cache *ca) +{ + if (ca->invalidate_needs_gc) + 
return; + + switch (CACHE_REPLACEMENT(&ca->sb)) { + case CACHE_REPLACEMENT_LRU: + invalidate_buckets_lru(ca); + break; + case CACHE_REPLACEMENT_FIFO: + invalidate_buckets_fifo(ca); + break; + case CACHE_REPLACEMENT_RANDOM: + invalidate_buckets_random(ca); + break; + } +} + +#define allocator_wait(ca, cond) \ +do { \ + DEFINE_WAIT(__wait); \ + \ + while (!(cond)) { \ + prepare_to_wait(&ca->set->alloc_wait, \ + &__wait, TASK_INTERRUPTIBLE); \ + \ + mutex_unlock(&(ca)->set->bucket_lock); \ + if (test_bit(CACHE_SET_STOPPING_2, &ca->set->flags)) { \ + finish_wait(&ca->set->alloc_wait, &__wait); \ + closure_return(cl); \ + } \ + \ + schedule(); \ + __set_current_state(TASK_RUNNING); \ + mutex_lock(&(ca)->set->bucket_lock); \ + } \ + \ + finish_wait(&ca->set->alloc_wait, &__wait); \ +} while (0) + +void bch_allocator_thread(struct closure *cl) +{ + struct cache *ca = container_of(cl, struct cache, alloc); + + mutex_lock(&ca->set->bucket_lock); + + while (1) { + while (1) { + long bucket; + + if ((!atomic_read(&ca->set->prio_blocked) || + !CACHE_SYNC(&ca->set->sb)) && + !fifo_empty(&ca->unused)) + fifo_pop(&ca->unused, bucket); + else if (!fifo_empty(&ca->free_inc)) + fifo_pop(&ca->free_inc, bucket); + else + break; + + allocator_wait(ca, (int) fifo_free(&ca->free) > + atomic_read(&ca->discards_in_flight)); + + if (ca->discard) { + allocator_wait(ca, !list_empty(&ca->discards)); + do_discard(ca, bucket); + } else { + fifo_push(&ca->free, bucket); + closure_wake_up(&ca->set->bucket_wait); + } + } + + allocator_wait(ca, ca->set->gc_mark_valid); + invalidate_buckets(ca); + + allocator_wait(ca, !atomic_read(&ca->set->prio_blocked) || + !CACHE_SYNC(&ca->set->sb)); + + if (CACHE_SYNC(&ca->set->sb) && + (!fifo_empty(&ca->free_inc) || + ca->need_save_prio > 64)) { + bch_prio_write(ca); + } + } +} + +long bch_bucket_alloc(struct cache *ca, unsigned watermark, struct closure *cl) +{ + long r = -1; +again: + wake_up(&ca->set->alloc_wait); + + if (fifo_used(&ca->free) > ca->watermark[watermark] && + fifo_pop(&ca->free, r)) { + struct bucket *b = ca->buckets + r; +#ifdef CONFIG_BCACHE_EDEBUG + size_t iter; + long i; + + for (iter = 0; iter < prio_buckets(ca) * 2; iter++) + BUG_ON(ca->prio_buckets[iter] == (uint64_t) r); + + fifo_for_each(i, &ca->free, iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->free_inc, iter) + BUG_ON(i == r); + fifo_for_each(i, &ca->unused, iter) + BUG_ON(i == r); +#endif + BUG_ON(atomic_read(&b->pin) != 1); + + SET_GC_SECTORS_USED(b, ca->sb.bucket_size); + + if (watermark <= WATERMARK_METADATA) { + SET_GC_MARK(b, GC_MARK_METADATA); + b->prio = BTREE_PRIO; + } else { + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + b->prio = INITIAL_PRIO; + } + + return r; + } + + pr_debug("alloc failure: blocked %i free %zu free_inc %zu unused %zu", + atomic_read(&ca->set->prio_blocked), fifo_used(&ca->free), + fifo_used(&ca->free_inc), fifo_used(&ca->unused)); + + if (cl) { + closure_wait(&ca->set->bucket_wait, cl); + + if (closure_blocking(cl)) { + mutex_unlock(&ca->set->bucket_lock); + closure_sync(cl); + mutex_lock(&ca->set->bucket_lock); + goto again; + } + } + + return -1; +} + +void bch_bucket_free(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bucket *b = PTR_BUCKET(c, k, i); + + SET_GC_MARK(b, 0); + SET_GC_SECTORS_USED(b, 0); + bch_bucket_add_unused(PTR_CACHE(c, k, i), b); + } +} + +int __bch_bucket_alloc_set(struct cache_set *c, unsigned watermark, + struct bkey *k, int n, struct closure *cl) +{ + int i; + + lockdep_assert_held(&c->bucket_lock); 
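+
+	/* One bucket per pointer, each from a different cache device; a
+	 * single key carries at most 8 pointers here */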
+ BUG_ON(!n || n > c->caches_loaded || n > 8);
+
+ bkey_init(k);
+
+ /* sort by free space/prio of oldest data in caches */
+
+ for (i = 0; i < n; i++) {
+ struct cache *ca = c->cache_by_alloc[i];
+ long b = bch_bucket_alloc(ca, watermark, cl);
+
+ if (b == -1)
+ goto err;
+
+ k->ptr[i] = PTR(ca->buckets[b].gen,
+ bucket_to_sector(c, b),
+ ca->sb.nr_this_dev);
+
+ SET_KEY_PTRS(k, i + 1);
+ }
+
+ return 0;
+err:
+ bch_bucket_free(c, k);
+ __bkey_put(c, k);
+ return -1;
+}
+
+int bch_bucket_alloc_set(struct cache_set *c, unsigned watermark,
+ struct bkey *k, int n, struct closure *cl)
+{
+ int ret;
+ mutex_lock(&c->bucket_lock);
+ ret = __bch_bucket_alloc_set(c, watermark, k, n, cl);
+ mutex_unlock(&c->bucket_lock);
+ return ret;
+}
+
+/* Init */
+
+void bch_cache_allocator_exit(struct cache *ca)
+{
+ struct discard *d;
+
+ while (!list_empty(&ca->discards)) {
+ d = list_first_entry(&ca->discards, struct discard, list);
+ cancel_work_sync(&d->work);
+ list_del(&d->list);
+ kfree(d);
+ }
+}
+
+int bch_cache_allocator_init(struct cache *ca)
+{
+ unsigned i;
+
+ /*
+ * Reserve:
+ * Prio/gen writes first
+ * Then 8 for btree allocations
+ * Then half for the moving garbage collector
+ */
+
+ ca->watermark[WATERMARK_PRIO] = 0;
+
+ ca->watermark[WATERMARK_METADATA] = prio_buckets(ca);
+
+ ca->watermark[WATERMARK_MOVINGGC] = 8 +
+ ca->watermark[WATERMARK_METADATA];
+
+ ca->watermark[WATERMARK_NONE] = ca->free.size / 2 +
+ ca->watermark[WATERMARK_MOVINGGC];
+
+ for (i = 0; i < MAX_IN_FLIGHT_DISCARDS; i++) {
+ struct discard *d = kzalloc(sizeof(*d), GFP_KERNEL);
+ if (!d)
+ return -ENOMEM;
+
+ d->ca = ca;
+ INIT_WORK(&d->work, discard_finish);
+ list_add(&d->list, &ca->discards);
+ }
+
+ return 0;
+}
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
new file mode 100644
index 000000000000..d01a553f63f3
--- /dev/null
+++ b/drivers/md/bcache/bcache.h
@@ -0,0 +1,1232 @@
+#ifndef _BCACHE_H
+#define _BCACHE_H
+
+/*
+ * SOME HIGH LEVEL CODE DOCUMENTATION:
+ *
+ * Bcache mostly works with cache sets, cache devices, and backing devices.
+ *
+ * Support for multiple cache devices hasn't quite been finished off yet, but
+ * it's about 95% plumbed through. A cache set and its cache devices are sort of
+ * like an md raid array and its component devices. Most of the code doesn't
+ * care about individual cache devices; the main abstraction is the cache set.
+ *
+ * Multiple cache devices are intended to give us the ability to mirror dirty
+ * cached data and metadata, without mirroring clean cached data.
+ *
+ * Backing devices are different, in that they have a lifetime independent of a
+ * cache set. When you register a newly formatted backing device it'll come up
+ * in passthrough mode, and then you can attach it to and detach it from a
+ * cache set at runtime - while it's mounted and in use. Detaching implicitly
+ * invalidates any cached data for that backing device.
+ *
+ * A cache set can have multiple (many) backing devices attached to it.
+ *
+ * There are also flash only volumes - this is the reason for the distinction
+ * between struct cached_dev and struct bcache_device. A flash only volume
+ * works much like a bcache device that has a backing device, except the
+ * "cached" data is always dirty. The end result is that we get thin
+ * provisioning with very little additional code.
+ *
+ * Flash only volumes work but they're not production ready because the moving
+ * garbage collector needs more work. More on that later.
+ *
+ * BUCKETS/ALLOCATION:
+ *
+ * Bcache is primarily designed for caching, which means that in normal
+ * operation all of our available space will be allocated. Thus, we need an
+ * efficient way of deleting things from the cache so we can write new things to
+ * it.
+ *
+ * To do this, we first divide the cache device up into buckets. A bucket is the
+ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+
+ * works efficiently.
+ *
+ * Each bucket has a 16 bit priority, and an 8 bit generation associated with
+ * it. The gens and priorities for all the buckets are stored contiguously and
+ * packed on disk (in a linked list of buckets - aside from the superblock, all
+ * of bcache's metadata is stored in buckets).
+ *
+ * The priority is used to implement an LRU. We reset a bucket's priority when
+ * we allocate it or on a cache hit, and every so often we decrement the
+ * priority of each bucket. It could be used to implement something more
+ * sophisticated, if anyone ever gets around to it.
+ *
+ * The generation is used for invalidating buckets. Each pointer also has an 8
+ * bit generation embedded in it; for a pointer to be considered valid, its gen
+ * must match the gen of the bucket it points into. Thus, to reuse a bucket all
+ * we have to do is increment its gen (and write its new gen to disk; we batch
+ * this up).
+ *
+ * Bcache is entirely COW - we never write twice to a bucket, even buckets that
+ * contain metadata (including btree nodes).
+ *
+ * THE BTREE:
+ *
+ * Bcache is in large part designed around the btree.
+ *
+ * At a high level, the btree is just an index of key -> ptr tuples.
+ *
+ * Keys represent extents, and thus have a size field. Keys also have a variable
+ * number of pointers attached to them (potentially zero, which is handy for
+ * invalidating the cache).
+ *
+ * The key itself is an inode:offset pair. The inode number corresponds to a
+ * backing device or a flash only volume. The offset is the ending offset of the
+ * extent within the inode - not the starting offset; this makes lookups
+ * slightly more convenient.
+ *
+ * Pointers contain the cache device id, the offset on that device, and an 8 bit
+ * generation number. More on the gen later.
+ *
+ * Index lookups are not fully abstracted - cache lookups in particular are
+ * still somewhat mixed in with the btree code, but things are headed in that
+ * direction.
+ *
+ * Updates are fairly well abstracted, though. There are two different ways of
+ * updating the btree; insert and replace.
+ *
+ * BTREE_INSERT will just take a list of keys and insert them into the btree -
+ * overwriting (possibly only partially) any extents they overlap with. This is
+ * used to update the index after a write.
+ *
+ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is
+ * overwriting a key that matches another given key. This is used for inserting
+ * data into the cache after a cache miss, and for background writeback, and for
+ * the moving garbage collector.
+ *
+ * There is no "delete" operation; deleting things from the index is
+ * accomplished either by invalidating pointers (by incrementing a bucket's
+ * gen) or by inserting a key with 0 pointers - which will overwrite anything
+ * previously present at that location in the index.
+ *
+ * This means that there are always stale/invalid keys in the btree. They're
+ * filtered out by the code that iterates through a btree node, and removed when
+ * a btree node is rewritten.
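+ *
+ * (To make the gen rule concrete, using the helpers defined further down
+ * in this header: the i'th pointer of key k is stale exactly when
+ *
+ *	PTR_GEN(k, i) != PTR_BUCKET(c, k, i)->gen
+ *
+ * so bumping a bucket's gen is a constant-time invalidation of every key
+ * pointing into it.)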
+ *
+ * BTREE NODES:
+ *
+ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
+ * free smaller than a bucket - so, that's how big our btree nodes are.
+ *
+ * (If buckets are really big we'll only use part of the bucket for a btree node
+ * - no less than 1/4th - but a bucket still contains no more than a single
+ * btree node. I'd actually like to change this, but for now we rely on the
+ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
+ *
+ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
+ * btree implementation.
+ *
+ * The way this is solved is that btree nodes are internally log structured; we
+ * can append new keys to an existing btree node without rewriting it. This
+ * means each set of keys we write is sorted, but the node is not.
+ *
+ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
+ * be expensive, and we have to distinguish between the keys we have written and
+ * the keys we haven't. So to do a lookup in a btree node, we have to search
+ * each sorted set. But we do merge written sets together lazily, so the cost of
+ * these extra searches is quite low (normally most of the keys in a btree node
+ * will be in one big set, and then there'll be one or two sets that are much
+ * smaller).
+ *
+ * This log structure makes bcache's btree more of a hybrid between a
+ * conventional btree and a compacting data structure, with some of the
+ * advantages of both.
+ *
+ * GARBAGE COLLECTION:
+ *
+ * We can't just invalidate any bucket - it might contain dirty data or
+ * metadata. If it once contained dirty data, other writes might overwrite it
+ * later, leaving no valid pointers into that bucket in the index.
+ *
+ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
+ * It also counts how much valid data each bucket currently contains, so that
+ * allocation can reuse buckets sooner when they've been mostly overwritten.
+ *
+ * It also does some things that are really internal to the btree
+ * implementation. If a btree node contains pointers that are stale by more than
+ * some threshold, it rewrites the btree node to avoid the bucket's generation
+ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
+ *
+ * THE JOURNAL:
+ *
+ * Bcache's journal is not necessary for consistency; we always strictly
+ * order metadata writes so that the btree and everything else is consistent on
+ * disk in the event of an unclean shutdown, and in fact bcache had writeback
+ * caching (with recovery from unclean shutdown) before journalling was
+ * implemented.
+ *
+ * Rather, the journal is purely a performance optimization; we can't complete a
+ * write until we've updated the index on disk, otherwise the cache would be
+ * inconsistent in the event of an unclean shutdown. This means that without the
+ * journal, on random write workloads we constantly have to update all the leaf
+ * nodes in the btree, and those writes will be mostly empty (appending at most
+ * a few keys each) - highly inefficient in terms of the amount of metadata
+ * written, and it puts more strain on the various btree resorting/compacting
+ * code.
+ *
+ * The journal is just a log of keys we've inserted; on startup we just reinsert
+ * all the keys in the open journal entries. That means that when we're updating
+ * a node in the btree, we can wait until a 4k block of keys fills up before
+ * writing them out.
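+ *
+ * (Back-of-the-envelope, not from the code: a bkey with a single pointer
+ * is three u64s - high, low, one ptr - i.e. 24 bytes, so one 4k block can
+ * batch up on the order of 170 such keys before the leaf gets written.)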
+ * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#define pr_fmt(fmt) "bcache: %s() " fmt "\n", __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" +#include "closure.h" + +struct bucket { + atomic_t pin; + uint16_t prio; + uint8_t gen; + uint8_t disk_gen; + uint8_t last_gc; /* Most out of date gen in the btree */ + uint8_t gc_gen; + uint16_t gc_mark; +}; + +/* + * I'd use bitfields for these, but I don't trust the compiler not to screw me + * as multiple threads touch struct bucket without locking + */ + +BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2); +#define GC_MARK_RECLAIMABLE 0 +#define GC_MARK_DIRTY 1 +#define GC_MARK_METADATA 2 +BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, 14); + +struct bkey { + uint64_t high; + uint64_t low; + uint64_t ptr[]; +}; + +/* Enough for a key with 6 pointers */ +#define BKEY_PAD 8 + +#define BKEY_PADDED(key) \ + union { struct bkey key; uint64_t key ## _pad[BKEY_PAD]; } + +/* Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: New UUID format + */ +#define BCACHE_SB_VERSION 3 + +#define SB_SECTOR 8 +#define SB_SIZE 4096 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +/* SB_JOURNAL_BUCKETS must be divisible by BITS_PER_LONG */ +#define MAX_CACHES_PER_SET 8 + +#define BDEV_DATA_START 16 /* sectors */ + +struct cache_sb { + uint64_t csum; + uint64_t offset; /* sector where this sb was written */ + uint64_t version; +#define CACHE_BACKING_DEV 1 + + uint8_t magic[16]; + + uint8_t uuid[16]; + union { + uint8_t set_uuid[16]; + uint64_t set_magic; + }; + uint8_t label[SB_LABEL_SIZE]; + + uint64_t flags; + uint64_t seq; + uint64_t pad[8]; + + uint64_t nbuckets; /* device size */ + uint16_t block_size; /* sectors */ + uint16_t bucket_size; /* sectors */ + + uint16_t nr_in_set; + uint16_t nr_this_dev; + + uint32_t last_mount; /* time_t */ + + uint16_t first_bucket; + union { + uint16_t njournal_buckets; + uint16_t keys; + }; + uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +/* Version 1: Seed pointer into btree node checksum + */ +#define BCACHE_BSET_VERSION 1 + +/* + * This is the on disk format for btree nodes - a btree node on disk is a list + * of these; within each set the keys are sorted + */ +struct bset { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t keys; + + union { + struct bkey start[0]; + uint64_t d[0]; + }; +}; + +/* + * On disk format for priorities and gens - see super.c near prio_write() for + * more. 
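+ * Each such bucket holds a prio_set header followed by prios_per_bucket(c)
+ * packed bucket_disk entries (see the macro further down), and next_bucket
+ * points at the bucket continuing the list.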
+ */
+struct prio_set {
+ uint64_t csum;
+ uint64_t magic;
+ uint64_t seq;
+ uint32_t version;
+ uint32_t pad;
+
+ uint64_t next_bucket;
+
+ struct bucket_disk {
+ uint16_t prio;
+ uint8_t gen;
+ } __attribute((packed)) data[];
+};
+
+struct uuid_entry {
+ union {
+ struct {
+ uint8_t uuid[16];
+ uint8_t label[32];
+ uint32_t first_reg;
+ uint32_t last_reg;
+ uint32_t invalidated;
+
+ uint32_t flags;
+ /* Size of flash only volumes */
+ uint64_t sectors;
+ };
+
+ uint8_t pad[128];
+ };
+};
+
+BITMASK(UUID_FLASH_ONLY, struct uuid_entry, flags, 0, 1);
+
+#include "journal.h"
+#include "stats.h"
+struct search;
+struct btree;
+struct keybuf;
+
+struct keybuf_key {
+ struct rb_node node;
+ BKEY_PADDED(key);
+ void *private;
+};
+
+typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
+
+struct keybuf {
+ keybuf_pred_fn *key_predicate;
+
+ struct bkey last_scanned;
+ spinlock_t lock;
+
+ /*
+ * Beginning and end of range in rb tree - so that we can skip taking
+ * lock and checking the rb tree when we need to check for overlapping
+ * keys.
+ */
+ struct bkey start;
+ struct bkey end;
+
+ struct rb_root keys;
+
+#define KEYBUF_NR 100
+ DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
+};
+
+struct bio_split_pool {
+ struct bio_set *bio_split;
+ mempool_t *bio_split_hook;
+};
+
+struct bio_split_hook {
+ struct closure cl;
+ struct bio_split_pool *p;
+ struct bio *bio;
+ bio_end_io_t *bi_end_io;
+ void *bi_private;
+};
+
+struct bcache_device {
+ struct closure cl;
+
+ struct kobject kobj;
+
+ struct cache_set *c;
+ unsigned id;
+#define BCACHEDEVNAME_SIZE 12
+ char name[BCACHEDEVNAME_SIZE];
+
+ struct gendisk *disk;
+
+ /* If nonzero, we're closing */
+ atomic_t closing;
+
+ /* If nonzero, we're detaching/unregistering from cache set */
+ atomic_t detaching;
+
+ atomic_long_t sectors_dirty;
+ unsigned long sectors_dirty_gc;
+ unsigned long sectors_dirty_last;
+ long sectors_dirty_derivative;
+
+ mempool_t *unaligned_bvec;
+ struct bio_set *bio_split;
+
+ unsigned data_csum:1;
+
+ int (*cache_miss)(struct btree *, struct search *,
+ struct bio *, unsigned);
+ int (*ioctl) (struct bcache_device *, fmode_t, unsigned, unsigned long);
+
+ struct bio_split_pool bio_split_hook;
+};
+
+struct io {
+ /* Used to track sequential IO so it can be skipped */
+ struct hlist_node hash;
+ struct list_head lru;
+
+ unsigned long jiffies;
+ unsigned sequential;
+ sector_t last;
+};
+
+struct cached_dev {
+ struct list_head list;
+ struct bcache_device disk;
+ struct block_device *bdev;
+
+ struct cache_sb sb;
+ struct bio sb_bio;
+ struct bio_vec sb_bv[1];
+ struct closure_with_waitlist sb_write;
+
+ /* Refcount on the cache set. Always nonzero when we're caching. */
+ atomic_t count;
+ struct work_struct detach;
+
+ /*
+ * Device might not be running if it's dirty and the cache set hasn't
+ * shown up yet.
+ */
+ atomic_t running;
+
+ /*
+ * Writes take a shared lock from start to finish; scanning for dirty
+ * data to refill the rb tree requires an exclusive lock.
+ */
+ struct rw_semaphore writeback_lock;
+
+ /*
+ * Nonzero, and writeback has a refcount (d->count), iff there is dirty
+ * data in the cache. Protected by writeback_lock; must have a
+ * shared lock to set and an exclusive lock to clear.
+ */
+ atomic_t has_dirty;
+
+ struct ratelimit writeback_rate;
+ struct delayed_work writeback_rate_update;
+
+ /*
+ * Internal to the writeback code, so read_dirty() can keep track of
+ * where it's at.
+ */
+ sector_t last_read;
+
+ /* Number of writeback bios in flight */
+ atomic_t in_flight;
+ struct closure_with_timer writeback;
+ struct closure_waitlist writeback_wait;
+
+ struct keybuf writeback_keys;
+
+ /* For tracking sequential IO */
+#define RECENT_IO_BITS 7
+#define RECENT_IO (1 << RECENT_IO_BITS)
+ struct io io[RECENT_IO];
+ struct hlist_head io_hash[RECENT_IO + 1];
+ struct list_head io_lru;
+ spinlock_t io_lock;
+
+ struct cache_accounting accounting;
+
+ /* The rest of this all shows up in sysfs */
+ unsigned sequential_cutoff;
+ unsigned readahead;
+
+ unsigned sequential_merge:1;
+ unsigned verify:1;
+
+ unsigned writeback_metadata:1;
+ unsigned writeback_running:1;
+ unsigned char writeback_percent;
+ unsigned writeback_delay;
+
+ int writeback_rate_change;
+ int64_t writeback_rate_derivative;
+ uint64_t writeback_rate_target;
+
+ unsigned writeback_rate_update_seconds;
+ unsigned writeback_rate_d_term;
+ unsigned writeback_rate_p_term_inverse;
+ unsigned writeback_rate_d_smooth;
+};
+
+enum alloc_watermarks {
+ WATERMARK_PRIO,
+ WATERMARK_METADATA,
+ WATERMARK_MOVINGGC,
+ WATERMARK_NONE,
+ WATERMARK_MAX
+};
+
+struct cache {
+ struct cache_set *set;
+ struct cache_sb sb;
+ struct bio sb_bio;
+ struct bio_vec sb_bv[1];
+
+ struct kobject kobj;
+ struct block_device *bdev;
+
+ unsigned watermark[WATERMARK_MAX];
+
+ struct closure alloc;
+ struct workqueue_struct *alloc_workqueue;
+
+ struct closure prio;
+ struct prio_set *disk_buckets;
+
+ /*
+ * When allocating new buckets, prio_write() gets first dibs - since we
+ * may not be able to allocate at all without writing priorities and
+ * gens. prio_buckets[] contains the last buckets we wrote priorities to
+ * (so gc can mark them as metadata), prio_next[] contains the buckets
+ * allocated for the next prio write.
+ */
+ uint64_t *prio_buckets;
+ uint64_t *prio_last_buckets;
+
+ /*
+ * free: Buckets that are ready to be used
+ *
+ * free_inc: Incoming buckets - these are buckets that currently have
+ * cached data in them, and we can't reuse them until after we write
+ * their new gen to disk. After prio_write() finishes writing the new
+ * gens/prios, they'll be moved to the free list (and possibly discarded
+ * in the process)
+ *
+ * unused: GC found nothing pointing into these buckets (possibly
+ * because all the data they contained was overwritten), so we only
+ * need to discard them before they can be moved to the free list.
+ */
+ DECLARE_FIFO(long, free);
+ DECLARE_FIFO(long, free_inc);
+ DECLARE_FIFO(long, unused);
+
+ size_t fifo_last_bucket;
+
+ /* Allocation stuff: */
+ struct bucket *buckets;
+
+ DECLARE_HEAP(struct bucket *, heap);
+
+ /*
+ * max(gen - disk_gen) for all buckets. When it gets too big we have to
+ * call prio_write() to keep gens from wrapping.
+ */
+ uint8_t need_save_prio;
+ unsigned gc_move_threshold;
+
+ /*
+ * If nonzero, we know we aren't going to find any buckets to invalidate
+ * until a gc finishes - otherwise we could pointlessly burn a ton of
+ * cpu
+ */
+ unsigned invalidate_needs_gc:1;
+
+ bool discard; /* Get rid of? */
+
+ /*
+ * We preallocate structs for issuing discards to buckets, and keep them
+ * on this list when they're not in use; do_discard() issues discards
+ * whenever there's work to do and is called by free_some_buckets() and
+ * when a discard finishes.
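+ * MAX_IN_FLIGHT_DISCARDS of them are preallocated in
+ * bch_cache_allocator_init().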
+ */ + atomic_t discards_in_flight; + struct list_head discards; + + struct journal_device journal; + + /* The rest of this all shows up in sysfs */ +#define IO_ERROR_SHIFT 20 + atomic_t io_errors; + atomic_t io_count; + + atomic_long_t meta_sectors_written; + atomic_long_t btree_sectors_written; + atomic_long_t sectors_written; + + struct bio_split_pool bio_split_hook; +}; + +struct gc_stat { + size_t nodes; + size_t key_bytes; + + size_t nkeys; + uint64_t data; /* sectors */ + uint64_t dirty; /* sectors */ + unsigned in_use; /* percent */ +}; + +/* + * Flag bits, for how the cache set is shutting down, and what phase it's at: + * + * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching + * all the backing devices first (their cached data gets invalidated, and they + * won't automatically reattach). + * + * CACHE_SET_STOPPING always gets set first when we're closing down a cache set; + * we'll continue to run normally for awhile with CACHE_SET_STOPPING set (i.e. + * flushing dirty data). + * + * CACHE_SET_STOPPING_2 gets set at the last phase, when it's time to shut down the + * allocation thread. + */ +#define CACHE_SET_UNREGISTERING 0 +#define CACHE_SET_STOPPING 1 +#define CACHE_SET_STOPPING_2 2 + +struct cache_set { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject internal; + struct dentry *debug; + struct cache_accounting accounting; + + unsigned long flags; + + struct cache_sb sb; + + struct cache *cache[MAX_CACHES_PER_SET]; + struct cache *cache_by_alloc[MAX_CACHES_PER_SET]; + int caches_loaded; + + struct bcache_device **devices; + struct list_head cached_devs; + uint64_t cached_dev_sectors; + struct closure caching; + + struct closure_with_waitlist sb_write; + + mempool_t *search; + mempool_t *bio_meta; + struct bio_set *bio_split; + + /* For the btree cache */ + struct shrinker shrink; + + /* For the allocator itself */ + wait_queue_head_t alloc_wait; + + /* For the btree cache and anything allocation related */ + struct mutex bucket_lock; + + /* log2(bucket_size), in sectors */ + unsigned short bucket_bits; + + /* log2(block_size), in sectors */ + unsigned short block_bits; + + /* + * Default number of pages for a new btree node - may be less than a + * full bucket + */ + unsigned btree_pages; + + /* + * Lists of struct btrees; lru is the list for structs that have memory + * allocated for actual btree node, freed is for structs that do not. + * + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. + * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct list_head btree_cache; + struct list_head btree_cache_freeable; + struct list_head btree_cache_freed; + + /* Number of elements in btree_cache + btree_cache_freeable lists */ + unsigned bucket_cache_used; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation. 
However, only one thread can be doing this + * at a time, for obvious reasons - try_harder and try_wait are + * basically a lock for this that we can wait on asynchronously. The + * btree_root() macro releases the lock when it returns. + */ + struct closure *try_harder; + struct closure_waitlist try_wait; + uint64_t try_harder_start; + + /* + * When we free a btree node, we increment the gen of the bucket the + * node is in - but we can't rewrite the prios and gens until we + * finished whatever it is we were doing, otherwise after a crash the + * btree node would be freed but for say a split, we might not have the + * pointers to the new nodes inserted into the btree yet. + * + * This is a refcount that blocks prio_write() until the new keys are + * written. + */ + atomic_t prio_blocked; + struct closure_waitlist bucket_wait; + + /* + * For any bio we don't skip we subtract the number of sectors from + * rescale; when it hits 0 we rescale all the bucket priorities. + */ + atomic_t rescale; + /* + * When we invalidate buckets, we use both the priority and the amount + * of good data to determine which buckets to reuse first - to weight + * those together consistently we keep track of the smallest nonzero + * priority of any bucket. + */ + uint16_t min_prio; + + /* + * max(gen - gc_gen) for all buckets. When it gets too big we have to gc + * to keep gens from wrapping around. + */ + uint8_t need_gc; + struct gc_stat gc_stats; + size_t nbuckets; + + struct closure_with_waitlist gc; + /* Where in the btree gc currently is */ + struct bkey gc_done; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. Protected by bucket_lock. + */ + int gc_mark_valid; + + /* Counts how many sectors bio_insert has added to the cache */ + atomic_t sectors_to_gc; + + struct closure moving_gc; + struct closure_waitlist moving_gc_wait; + struct keybuf moving_gc_keys; + /* Number of moving GC bios in flight */ + atomic_t in_flight; + + struct btree *root; + +#ifdef CONFIG_BCACHE_DEBUG + struct btree *verify_data; + struct mutex verify_lock; +#endif + + unsigned nr_uuids; + struct uuid_entry *uuids; + BKEY_PADDED(uuid_bucket); + struct closure_with_waitlist uuid_write; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - this is a single element mempool for btree_read_work() + */ + struct mutex fill_lock; + struct btree_iter *fill_iter; + + /* + * btree_sort() is a merge sort and requires temporary space - single + * element mempool + */ + struct mutex sort_lock; + struct bset *sort; + + /* List of buckets we're currently writing data to */ + struct list_head data_buckets; + spinlock_t data_bucket_lock; + + struct journal journal; + +#define CONGESTED_MAX 1024 + unsigned congested_last_us; + atomic_t congested; + + /* The rest of this all shows up in sysfs */ + unsigned congested_read_threshold_us; + unsigned congested_write_threshold_us; + + spinlock_t sort_time_lock; + struct time_stats sort_time; + struct time_stats btree_gc_time; + struct time_stats btree_split_time; + spinlock_t btree_read_time_lock; + struct time_stats btree_read_time; + struct time_stats try_harder_time; + + atomic_long_t cache_read_races; + atomic_long_t writeback_keys_done; + atomic_long_t writeback_keys_failed; + unsigned error_limit; + unsigned error_decay; + unsigned short journal_delay_ms; + unsigned verify:1; + unsigned key_merging_disabled:1; + unsigned gc_always_rewrite:1; + unsigned shrinker_disabled:1; + unsigned 
copy_gc_enabled:1; + +#define BUCKET_HASH_BITS 12 + struct hlist_head bucket_hash[1 << BUCKET_HASH_BITS]; +}; + +static inline bool key_merging_disabled(struct cache_set *c) +{ +#ifdef CONFIG_BCACHE_DEBUG + return c->key_merging_disabled; +#else + return 0; +#endif +} + +struct bbio { + unsigned submit_time_us; + union { + struct bkey key; + uint64_t _pad[3]; + /* + * We only need pad = 3 here because we only ever carry around a + * single pointer - i.e. the pointer we're doing io to/from. + */ + }; + struct bio bio; +}; + +static inline unsigned local_clock_us(void) +{ + return local_clock() >> 10; +} + +#define MAX_BSETS 4U + +#define BTREE_PRIO USHRT_MAX +#define INITIAL_PRIO 32768 + +#define btree_bytes(c) ((c)->btree_pages * PAGE_SIZE) +#define btree_blocks(b) \ + ((unsigned) (KEY_SIZE(&b->key) >> (b)->c->block_bits)) + +#define btree_default_blocks(c) \ + ((unsigned) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits)) + +#define bucket_pages(c) ((c)->sb.bucket_size / PAGE_SECTORS) +#define bucket_bytes(c) ((c)->sb.bucket_size << 9) +#define block_bytes(c) ((c)->sb.block_size << 9) + +#define __set_bytes(i, k) (sizeof(*(i)) + (k) * sizeof(uint64_t)) +#define set_bytes(i) __set_bytes(i, i->keys) + +#define __set_blocks(i, k, c) DIV_ROUND_UP(__set_bytes(i, k), block_bytes(c)) +#define set_blocks(i, c) __set_blocks(i, (i)->keys, c) + +#define node(i, j) ((struct bkey *) ((i)->d + (j))) +#define end(i) node(i, (i)->keys) + +#define index(i, b) \ + ((size_t) (((void *) i - (void *) (b)->sets[0].data) / \ + block_bytes(b->c))) + +#define btree_data_space(b) (PAGE_SIZE << (b)->page_order) + +#define prios_per_bucket(c) \ + ((bucket_bytes(c) - sizeof(struct prio_set)) / \ + sizeof(struct bucket_disk)) +#define prio_buckets(c) \ + DIV_ROUND_UP((size_t) (c)->sb.nbuckets, prios_per_bucket(c)) + +#define JSET_MAGIC 0x245235c1a3625032ULL +#define PSET_MAGIC 0x6750e15f87337f91ULL +#define BSET_MAGIC 0x90135c78b99e07f5ULL + +#define jset_magic(c) ((c)->sb.set_magic ^ JSET_MAGIC) +#define pset_magic(c) ((c)->sb.set_magic ^ PSET_MAGIC) +#define bset_magic(c) ((c)->sb.set_magic ^ BSET_MAGIC) + +/* Bkey fields: all units are in sectors */ + +#define KEY_FIELD(name, field, offset, size) \ + BITMASK(name, struct bkey, field, offset, size) + +#define PTR_FIELD(name, offset, size) \ + static inline uint64_t name(const struct bkey *k, unsigned i) \ + { return (k->ptr[i] >> offset) & ~(((uint64_t) ~0) << size); } \ + \ + static inline void SET_##name(struct bkey *k, unsigned i, uint64_t v)\ + { \ + k->ptr[i] &= ~(~((uint64_t) ~0 << size) << offset); \ + k->ptr[i] |= v << offset; \ + } + +KEY_FIELD(KEY_PTRS, high, 60, 3) +KEY_FIELD(HEADER_SIZE, high, 58, 2) +KEY_FIELD(KEY_CSUM, high, 56, 2) +KEY_FIELD(KEY_PINNED, high, 55, 1) +KEY_FIELD(KEY_DIRTY, high, 36, 1) + +KEY_FIELD(KEY_SIZE, high, 20, 16) +KEY_FIELD(KEY_INODE, high, 0, 20) + +/* Next time I change the on disk format, KEY_OFFSET() won't be 64 bits */ + +static inline uint64_t KEY_OFFSET(const struct bkey *k) +{ + return k->low; +} + +static inline void SET_KEY_OFFSET(struct bkey *k, uint64_t v) +{ + k->low = v; +} + +PTR_FIELD(PTR_DEV, 51, 12) +PTR_FIELD(PTR_OFFSET, 8, 43) +PTR_FIELD(PTR_GEN, 0, 8) + +#define PTR_CHECK_DEV ((1 << 12) - 1) + +#define PTR(gen, offset, dev) \ + ((((uint64_t) dev) << 51) | ((uint64_t) offset) << 8 | gen) + +static inline size_t sector_to_bucket(struct cache_set *c, sector_t s) +{ + return s >> c->bucket_bits; +} + +static inline sector_t bucket_to_sector(struct cache_set *c, size_t b) +{ + return ((sector_t) b) << 
c->bucket_bits; +} + +static inline sector_t bucket_remainder(struct cache_set *c, sector_t s) +{ + return s & (c->sb.bucket_size - 1); +} + +static inline struct cache *PTR_CACHE(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return c->cache[PTR_DEV(k, ptr)]; +} + +static inline size_t PTR_BUCKET_NR(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return sector_to_bucket(c, PTR_OFFSET(k, ptr)); +} + +static inline struct bucket *PTR_BUCKET(struct cache_set *c, + const struct bkey *k, + unsigned ptr) +{ + return PTR_CACHE(c, k, ptr)->buckets + PTR_BUCKET_NR(c, k, ptr); +} + +/* Btree key macros */ + +/* + * The high bit being set is a relic from when we used it to do binary + * searches - it told you where a key started. It's not used anymore, + * and can probably be safely dropped. + */ +#define KEY(dev, sector, len) (struct bkey) \ +{ \ + .high = (1ULL << 63) | ((uint64_t) (len) << 20) | (dev), \ + .low = (sector) \ +} + +static inline void bkey_init(struct bkey *k) +{ + *k = KEY(0, 0, 0); +} + +#define KEY_START(k) (KEY_OFFSET(k) - KEY_SIZE(k)) +#define START_KEY(k) KEY(KEY_INODE(k), KEY_START(k), 0) +#define MAX_KEY KEY(~(~0 << 20), ((uint64_t) ~0) >> 1, 0) +#define ZERO_KEY KEY(0, 0, 0) + +/* + * This is used for various on disk data structures - cache_sb, prio_set, bset, + * jset: The checksum is _always_ the first 8 bytes of these structs + */ +#define csum_set(i) \ + crc64(((void *) (i)) + sizeof(uint64_t), \ + ((void *) end(i)) - (((void *) (i)) + sizeof(uint64_t))) + +/* Error handling macros */ + +#define btree_bug(b, ...) \ +do { \ + if (bch_cache_set_error((b)->c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define cache_bug(c, ...) \ +do { \ + if (bch_cache_set_error(c, __VA_ARGS__)) \ + dump_stack(); \ +} while (0) + +#define btree_bug_on(cond, b, ...) \ +do { \ + if (cond) \ + btree_bug(b, __VA_ARGS__); \ +} while (0) + +#define cache_bug_on(cond, c, ...) \ +do { \ + if (cond) \ + cache_bug(c, __VA_ARGS__); \ +} while (0) + +#define cache_set_err_on(cond, c, ...) \ +do { \ + if (cond) \ + bch_cache_set_error(c, __VA_ARGS__); \ +} while (0) + +/* Looping macros */ + +#define for_each_cache(ca, cs, iter) \ + for (iter = 0; ca = cs->cache[iter], iter < (cs)->sb.nr_in_set; iter++) + +#define for_each_bucket(b, ca) \ + for (b = (ca)->buckets + (ca)->sb.first_bucket; \ + b < (ca)->buckets + (ca)->sb.nbuckets; b++) + +static inline void __bkey_put(struct cache_set *c, struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); +} + +/* Blktrace macros */ + +#define blktrace_msg(c, fmt, ...) \ +do { \ + struct request_queue *q = bdev_get_queue(c->bdev); \ + if (q) \ + blk_add_trace_msg(q, fmt, ##__VA_ARGS__); \ +} while (0) + +#define blktrace_msg_all(s, fmt, ...) \ +do { \ + struct cache *_c; \ + unsigned i; \ + for_each_cache(_c, (s), i) \ + blktrace_msg(_c, fmt, ##__VA_ARGS__); \ +} while (0) + +static inline void cached_dev_put(struct cached_dev *dc) +{ + if (atomic_dec_and_test(&dc->count)) + schedule_work(&dc->detach); +} + +static inline bool cached_dev_get(struct cached_dev *dc) +{ + if (!atomic_inc_not_zero(&dc->count)) + return false; + + /* Paired with the mb in cached_dev_attach */ + smp_mb__after_atomic_inc(); + return true; +} + +/* + * bucket_gc_gen() returns the difference between the bucket's current gen and + * the oldest gen of any pointer into that bucket in the btree (last_gc). 
+ * + * bucket_disk_gen() returns the difference between the current gen and the gen + * on disk; they're both used to make sure gens don't wrap around. + */ + +static inline uint8_t bucket_gc_gen(struct bucket *b) +{ + return b->gen - b->last_gc; +} + +static inline uint8_t bucket_disk_gen(struct bucket *b) +{ + return b->gen - b->disk_gen; +} + +#define BUCKET_GC_GEN_MAX 96U +#define BUCKET_DISK_GEN_MAX 64U + +#define kobj_attribute_write(n, fn) \ + static struct kobj_attribute ksysfs_##n = __ATTR(n, S_IWUSR, NULL, fn) + +#define kobj_attribute_rw(n, show, store) \ + static struct kobj_attribute ksysfs_##n = \ + __ATTR(n, S_IWUSR|S_IRUSR, show, store) + +/* Forward declarations */ + +void bch_writeback_queue(struct cached_dev *); +void bch_writeback_add(struct cached_dev *, unsigned); + +void bch_count_io_errors(struct cache *, int, const char *); +void bch_bbio_count_io_errors(struct cache_set *, struct bio *, + int, const char *); +void bch_bbio_endio(struct cache_set *, struct bio *, int, const char *); +void bch_bbio_free(struct bio *, struct cache_set *); +struct bio *bch_bbio_alloc(struct cache_set *); + +struct bio *bch_bio_split(struct bio *, int, gfp_t, struct bio_set *); +void bch_generic_make_request(struct bio *, struct bio_split_pool *); +void __bch_submit_bbio(struct bio *, struct cache_set *); +void bch_submit_bbio(struct bio *, struct cache_set *, struct bkey *, unsigned); + +uint8_t bch_inc_gen(struct cache *, struct bucket *); +void bch_rescale_priorities(struct cache_set *, int); +bool bch_bucket_add_unused(struct cache *, struct bucket *); +void bch_allocator_thread(struct closure *); + +long bch_bucket_alloc(struct cache *, unsigned, struct closure *); +void bch_bucket_free(struct cache_set *, struct bkey *); + +int __bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, struct closure *); +int bch_bucket_alloc_set(struct cache_set *, unsigned, + struct bkey *, int, struct closure *); + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *, const char *, ...); + +void bch_prio_write(struct cache *); +void bch_write_bdev_super(struct cached_dev *, struct closure *); + +extern struct workqueue_struct *bcache_wq, *bch_gc_wq; +extern const char * const bch_cache_modes[]; +extern struct mutex bch_register_lock; +extern struct list_head bch_cache_sets; + +extern struct kobj_type bch_cached_dev_ktype; +extern struct kobj_type bch_flash_dev_ktype; +extern struct kobj_type bch_cache_set_ktype; +extern struct kobj_type bch_cache_set_internal_ktype; +extern struct kobj_type bch_cache_ktype; + +void bch_cached_dev_release(struct kobject *); +void bch_flash_dev_release(struct kobject *); +void bch_cache_set_release(struct kobject *); +void bch_cache_release(struct kobject *); + +int bch_uuid_write(struct cache_set *); +void bcache_write_super(struct cache_set *); + +int bch_flash_dev_create(struct cache_set *c, uint64_t size); + +int bch_cached_dev_attach(struct cached_dev *, struct cache_set *); +void bch_cached_dev_detach(struct cached_dev *); +void bch_cached_dev_run(struct cached_dev *); +void bcache_device_stop(struct bcache_device *); + +void bch_cache_set_unregister(struct cache_set *); +void bch_cache_set_stop(struct cache_set *); + +struct cache_set *bch_cache_set_alloc(struct cache_sb *); +void bch_btree_cache_free(struct cache_set *); +int bch_btree_cache_alloc(struct cache_set *); +void bch_writeback_init_cached_dev(struct cached_dev *); +void bch_moving_init_cache_set(struct cache_set *); + +void bch_cache_allocator_exit(struct cache *ca); 
+int bch_cache_allocator_init(struct cache *ca);
+
+void bch_debug_exit(void);
+int bch_debug_init(struct kobject *);
+void bch_writeback_exit(void);
+int bch_writeback_init(void);
+void bch_request_exit(void);
+int bch_request_init(void);
+void bch_btree_exit(void);
+int bch_btree_init(void);
+
+#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
new file mode 100644
index 000000000000..bb0f7ae14b3c
--- /dev/null
+++ b/drivers/md/bcache/bset.c
@@ -0,0 +1,1190 @@
+/*
+ * Code for working with individual keys, and sorted sets of keys within a
+ * btree node
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+
+#include
+
+/* Keylists */
+
+void bch_keylist_copy(struct keylist *dest, struct keylist *src)
+{
+ *dest = *src;
+
+ if (src->list == src->d) {
+ size_t n = (uint64_t *) src->top - src->d;
+ dest->top = (struct bkey *) &dest->d[n];
+ dest->list = dest->d;
+ }
+}
+
+int bch_keylist_realloc(struct keylist *l, int nptrs, struct cache_set *c)
+{
+ unsigned oldsize = (uint64_t *) l->top - l->list;
+ unsigned newsize = oldsize + 2 + nptrs;
+ uint64_t *new;
+
+ /* The journalling code doesn't handle the case where the keys to insert
+ * are bigger than an empty write: If we just return -ENOMEM here,
+ * bio_insert() and bio_invalidate() will insert the keys created so far
+ * and finish the rest when the keylist is empty.
+ */
+ if (newsize * sizeof(uint64_t) > block_bytes(c) - sizeof(struct jset))
+ return -ENOMEM;
+
+ newsize = roundup_pow_of_two(newsize);
+
+ if (newsize <= KEYLIST_INLINE ||
+ roundup_pow_of_two(oldsize) == newsize)
+ return 0;
+
+ new = krealloc(l->list == l->d ? NULL : l->list,
+ sizeof(uint64_t) * newsize, GFP_NOIO);
+
+ if (!new)
+ return -ENOMEM;
+
+ if (l->list == l->d)
+ memcpy(new, l->list, sizeof(uint64_t) * KEYLIST_INLINE);
+
+ l->list = new;
+ l->top = (struct bkey *) (&l->list[oldsize]);
+
+ return 0;
+}
+
+struct bkey *bch_keylist_pop(struct keylist *l)
+{
+ struct bkey *k = l->bottom;
+
+ if (k == l->top)
+ return NULL;
+
+ while (bkey_next(k) != l->top)
+ k = bkey_next(k);
+
+ return l->top = k;
+}
+
+/* Pointer validation */
+
+bool __bch_ptr_invalid(struct cache_set *c, int level, const struct bkey *k)
+{
+ unsigned i;
+
+ if (level && (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k)))
+ goto bad;
+
+ if (!level && KEY_SIZE(k) > KEY_OFFSET(k))
+ goto bad;
+
+ if (!KEY_SIZE(k))
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(c, k, i)) {
+ struct cache *ca = PTR_CACHE(c, k, i);
+ size_t bucket = PTR_BUCKET_NR(c, k, i);
+ size_t r = bucket_remainder(c, PTR_OFFSET(k, i));
+
+ if (KEY_SIZE(k) + r > c->sb.bucket_size ||
+ bucket < ca->sb.first_bucket ||
+ bucket >= ca->sb.nbuckets)
+ goto bad;
+ }
+
+ return false;
+bad:
+ cache_bug(c, "spotted bad key %s: %s", pkey(k), bch_ptr_status(c, k));
+ return true;
+}
+
+bool bch_ptr_bad(struct btree *b, const struct bkey *k)
+{
+ struct bucket *g;
+ unsigned i, stale;
+
+ if (!bkey_cmp(k, &ZERO_KEY) ||
+ !KEY_PTRS(k) ||
+ bch_ptr_invalid(b, k))
+ return true;
+
+ if (KEY_PTRS(k) && PTR_DEV(k, 0) == PTR_CHECK_DEV)
+ return true;
+
+ for (i = 0; i < KEY_PTRS(k); i++)
+ if (ptr_available(b->c, k, i)) {
+ g = PTR_BUCKET(b->c, k, i);
+ stale = ptr_stale(b->c, k, i);
+
+ btree_bug_on(stale > 96, b,
+ "key too stale: %i, need_gc %u",
+ stale, b->c->need_gc);
+
+ btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
+ b, "stale dirty pointer");
+
+ if (stale)
+ return true;
+
+#ifdef CONFIG_BCACHE_EDEBUG
+ if
(!mutex_trylock(&b->c->bucket_lock)) + continue; + + if (b->level) { + if (KEY_DIRTY(k) || + g->prio != BTREE_PRIO || + (b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_METADATA)) + goto bug; + + } else { + if (g->prio == BTREE_PRIO) + goto bug; + + if (KEY_DIRTY(k) && + b->c->gc_mark_valid && + GC_MARK(g) != GC_MARK_DIRTY) + goto bug; + } + mutex_unlock(&b->c->bucket_lock); +#endif + } + + return false; +#ifdef CONFIG_BCACHE_EDEBUG +bug: + mutex_unlock(&b->c->bucket_lock); + btree_bug(b, "inconsistent pointer %s: bucket %li pin %i " + "prio %i gen %i last_gc %i mark %llu gc_gen %i", pkey(k), + PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin), + g->prio, g->gen, g->last_gc, GC_MARK(g), g->gc_gen); + return true; +#endif +} + +/* Key/pointer manipulation */ + +void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src, + unsigned i) +{ + BUG_ON(i > KEY_PTRS(src)); + + /* Only copy the header, key, and one pointer. */ + memcpy(dest, src, 2 * sizeof(uint64_t)); + dest->ptr[0] = src->ptr[i]; + SET_KEY_PTRS(dest, 1); + /* We didn't copy the checksum so clear that bit. */ + SET_KEY_CSUM(dest, 0); +} + +bool __bch_cut_front(const struct bkey *where, struct bkey *k) +{ + unsigned i, len = 0; + + if (bkey_cmp(where, &START_KEY(k)) <= 0) + return false; + + if (bkey_cmp(where, k) < 0) + len = KEY_OFFSET(k) - KEY_OFFSET(where); + else + bkey_copy_key(k, where); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + KEY_SIZE(k) - len); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +bool __bch_cut_back(const struct bkey *where, struct bkey *k) +{ + unsigned len = 0; + + if (bkey_cmp(where, k) >= 0) + return false; + + BUG_ON(KEY_INODE(where) != KEY_INODE(k)); + + if (bkey_cmp(where, &START_KEY(k)) > 0) + len = KEY_OFFSET(where) - KEY_START(k); + + bkey_copy_key(k, where); + + BUG_ON(len > KEY_SIZE(k)); + SET_KEY_SIZE(k, len); + return true; +} + +static uint64_t merge_chksums(struct bkey *l, struct bkey *r) +{ + return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) & + ~((uint64_t)1 << 63); +} + +/* Tries to merge l and r: l should be lower than r + * Returns true if we were able to merge. If we did merge, l will be the merged + * key, r will be untouched. 
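+ *
+ * (Worked example with made-up numbers: if l covers sectors 0..8 and r
+ * covers 8..16 of the same inode, with r's pointers starting exactly
+ * KEY_SIZE(l) sectors after l's, the merged l becomes one 0..16 extent.)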
+ */ +bool bch_bkey_try_merge(struct btree *b, struct bkey *l, struct bkey *r) +{ + unsigned i; + + if (key_merging_disabled(b->c)) + return false; + + if (KEY_PTRS(l) != KEY_PTRS(r) || + KEY_DIRTY(l) != KEY_DIRTY(r) || + bkey_cmp(l, &START_KEY(r))) + return false; + + for (i = 0; i < KEY_PTRS(l); i++) + if (l->ptr[i] + PTR(0, KEY_SIZE(l), 0) != r->ptr[i] || + PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i)) + return false; + + /* Keys with no pointers aren't restricted to one bucket and could + * overflow KEY_SIZE + */ + if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) { + SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l)); + SET_KEY_SIZE(l, USHRT_MAX); + + bch_cut_front(l, r); + return false; + } + + if (KEY_CSUM(l)) { + if (KEY_CSUM(r)) + l->ptr[KEY_PTRS(l)] = merge_chksums(l, r); + else + SET_KEY_CSUM(l, 0); + } + + SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); + SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); + + return true; +} + +/* Binary tree stuff for auxiliary search trees */ + +static unsigned inorder_next(unsigned j, unsigned size) +{ + if (j * 2 + 1 < size) { + j = j * 2 + 1; + + while (j * 2 < size) + j *= 2; + } else + j >>= ffz(j) + 1; + + return j; +} + +static unsigned inorder_prev(unsigned j, unsigned size) +{ + if (j * 2 < size) { + j = j * 2; + + while (j * 2 + 1 < size) + j = j * 2 + 1; + } else + j >>= ffs(j); + + return j; +} + +/* I have no idea why this code works... and I'm the one who wrote it + * + * However, I do know what it does: + * Given a binary tree constructed in an array (i.e. how you normally implement + * a heap), it converts a node in the tree - referenced by array index - to the + * index it would have if you did an inorder traversal. + * + * Also tested for every j, size up to size somewhere around 6 million. 
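+ *
+ * (Small worked case, for illustration: with size 4 the tree holds nodes
+ * 1..3, and __to_inorder() maps j = 1, 2, 3 to inorder positions 2, 1, 3
+ * - i.e. the root lands in the middle, as an inorder traversal should.)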
+ *
+ * The binary tree starts at array index 1, not 0
+ * extra is a function of size:
+ * extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ */
+static unsigned __to_inorder(unsigned j, unsigned size, unsigned extra)
+{
+ unsigned b = fls(j);
+ unsigned shift = fls(size - 1) - b;
+
+ j ^= 1U << (b - 1);
+ j <<= 1;
+ j |= 1;
+ j <<= shift;
+
+ if (j > extra)
+ j -= (j - extra) >> 1;
+
+ return j;
+}
+
+static unsigned to_inorder(unsigned j, struct bset_tree *t)
+{
+ return __to_inorder(j, t->size, t->extra);
+}
+
+static unsigned __inorder_to_tree(unsigned j, unsigned size, unsigned extra)
+{
+ unsigned shift;
+
+ if (j > extra)
+ j += j - extra;
+
+ shift = ffs(j);
+
+ j >>= shift;
+ j |= roundup_pow_of_two(size) >> shift;
+
+ return j;
+}
+
+static unsigned inorder_to_tree(unsigned j, struct bset_tree *t)
+{
+ return __inorder_to_tree(j, t->size, t->extra);
+}
+
+#if 0
+void inorder_test(void)
+{
+ unsigned long done = 0;
+ ktime_t start = ktime_get();
+
+ for (unsigned size = 2;
+ size < 65536000;
+ size++) {
+ unsigned extra = (size - rounddown_pow_of_two(size - 1)) << 1;
+ unsigned i = 1, j = rounddown_pow_of_two(size - 1);
+
+ if (!(size % 4096))
+ printk(KERN_NOTICE "loop %u, %llu per us\n", size,
+ done / ktime_us_delta(ktime_get(), start));
+
+ while (1) {
+ if (__inorder_to_tree(i, size, extra) != j)
+ panic("size %10u j %10u i %10u", size, j, i);
+
+ if (__to_inorder(j, size, extra) != i)
+ panic("size %10u j %10u i %10u", size, j, i);
+
+ if (j == rounddown_pow_of_two(size) - 1)
+ break;
+
+ BUG_ON(inorder_prev(inorder_next(j, size), size) != j);
+
+ j = inorder_next(j, size);
+ i++;
+ }
+
+ done += size - 1;
+ }
+}
+#endif
+
+/*
+ * Cacheline/offset <-> bkey pointer arithmetic:
+ *
+ * t->tree is a binary search tree in an array; each node corresponds to a key
+ * in one cacheline in t->set (BSET_CACHELINE bytes).
+ *
+ * This means we don't have to store the full index of the key that a node in
+ * the binary tree points to; to_inorder() gives us the cacheline, and then
+ * bkey_float->m gives us the offset within that cacheline, in units of 8 bytes.
+ *
+ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to
+ * make this work.
+ *
+ * To construct the bfloat for an arbitrary key we need to know what the key
+ * immediately preceding it is: we have to check if the two keys differ in the
+ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size
+ * of the previous key so we can walk backwards to it from t->tree[j]'s key.
+ */
+
+static struct bkey *cacheline_to_bkey(struct bset_tree *t, unsigned cacheline,
+ unsigned offset)
+{
+ return ((void *) t->data) + cacheline * BSET_CACHELINE + offset * 8;
+}
+
+static unsigned bkey_to_cacheline(struct bset_tree *t, struct bkey *k)
+{
+ return ((void *) k - (void *) t->data) / BSET_CACHELINE;
+}
+
+static unsigned bkey_to_cacheline_offset(struct bkey *k)
+{
+ return ((size_t) k & (BSET_CACHELINE - 1)) / sizeof(uint64_t);
+}
+
+static struct bkey *tree_to_bkey(struct bset_tree *t, unsigned j)
+{
+ return cacheline_to_bkey(t, to_inorder(j, t), t->tree[j].m);
+}
+
+static struct bkey *tree_to_prev_bkey(struct bset_tree *t, unsigned j)
+{
+ return (void *) (((uint64_t *) tree_to_bkey(t, j)) - t->prev[j]);
+}
+
+/*
+ * For the write set - the one we're currently inserting keys into - we don't
+ * maintain a full search tree, we just keep a simple lookup table in t->prev.
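+ * Since keys are laid out on 8 byte boundaries, an offset within a
+ * BSET_CACHELINE sized cacheline fits in only a few bits - which is what
+ * keeps bkey_float and the t->prev entries small.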
+ */ +static struct bkey *table_to_bkey(struct bset_tree *t, unsigned cacheline) +{ + return cacheline_to_bkey(t, cacheline, t->prev[cacheline]); +} + +static inline uint64_t shrd128(uint64_t high, uint64_t low, uint8_t shift) +{ +#ifdef CONFIG_X86_64 + asm("shrd %[shift],%[high],%[low]" + : [low] "+Rm" (low) + : [high] "R" (high), + [shift] "ci" (shift) + : "cc"); +#else + low >>= shift; + low |= (high << 1) << (63U - shift); +#endif + return low; +} + +static inline unsigned bfloat_mantissa(const struct bkey *k, + struct bkey_float *f) +{ + const uint64_t *p = &k->low - (f->exponent >> 6); + return shrd128(p[-1], p[0], f->exponent & 63) & BKEY_MANTISSA_MASK; +} + +static void make_bfloat(struct bset_tree *t, unsigned j) +{ + struct bkey_float *f = &t->tree[j]; + struct bkey *m = tree_to_bkey(t, j); + struct bkey *p = tree_to_prev_bkey(t, j); + + struct bkey *l = is_power_of_2(j) + ? t->data->start + : tree_to_prev_bkey(t, j >> ffs(j)); + + struct bkey *r = is_power_of_2(j + 1) + ? node(t->data, t->data->keys - bkey_u64s(&t->end)) + : tree_to_bkey(t, j >> (ffz(j) + 1)); + + BUG_ON(m < l || m > r); + BUG_ON(bkey_next(p) != m); + + if (KEY_INODE(l) != KEY_INODE(r)) + f->exponent = fls64(KEY_INODE(r) ^ KEY_INODE(l)) + 64; + else + f->exponent = fls64(r->low ^ l->low); + + f->exponent = max_t(int, f->exponent - BKEY_MANTISSA_BITS, 0); + + /* + * Setting f->exponent = 127 flags this node as failed, and causes the + * lookup code to fall back to comparing against the original key. + */ + + if (bfloat_mantissa(m, f) != bfloat_mantissa(p, f)) + f->mantissa = bfloat_mantissa(m, f) - 1; + else + f->exponent = 127; +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +{ + if (t != b->sets) { + unsigned j = roundup(t[-1].size, + 64 / sizeof(struct bkey_float)); + + t->tree = t[-1].tree + j; + t->prev = t[-1].prev + j; + } + + while (t < b->sets + MAX_BSETS) + t++->size = 0; +} + +static void bset_build_unwritten_tree(struct btree *b) +{ + struct bset_tree *t = b->sets + b->nsets; + + bset_alloc_tree(b, t); + + if (t->tree != b->sets->tree + bset_tree_space(b)) { + t->prev[0] = bkey_to_cacheline_offset(t->data->start); + t->size = 1; + } +} + +static void bset_build_written_tree(struct btree *b) +{ + struct bset_tree *t = b->sets + b->nsets; + struct bkey *k = t->data->start; + unsigned j, cacheline = 1; + + bset_alloc_tree(b, t); + + t->size = min_t(unsigned, + bkey_to_cacheline(t, end(t->data)), + b->sets->tree + bset_tree_space(b) - t->tree); + + if (t->size < 2) { + t->size = 0; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) { + while (bkey_to_cacheline(t, k) != cacheline) + k = bkey_next(k); + + t->prev[j] = bkey_u64s(k); + k = bkey_next(k); + cacheline++; + t->tree[j].m = bkey_to_cacheline_offset(k); + } + + while (bkey_next(k) != end(t->data)) + k = bkey_next(k); + + t->end = *k; + + /* Then we build the tree */ + for (j = inorder_next(0, t->size); + j; + j = inorder_next(j, t->size)) + make_bfloat(t, j); +} + +void bch_bset_fix_invalidated_key(struct btree *b, struct bkey *k) +{ + struct bset_tree *t; + unsigned inorder, j = 1; + + for (t = b->sets; t <= &b->sets[b->nsets]; t++) + if (k < end(t->data)) + goto found_set; + + BUG(); +found_set: + if (!t->size || !bset_written(b, t)) + return; + + inorder = bkey_to_cacheline(t, k); + + if (k == t->data->start) + goto fix_left; + + if (bkey_next(k) == 
end(t->data)) { + t->end = *k; + goto fix_right; + } + + j = inorder_to_tree(inorder, t); + + if (j && + j < t->size && + k == tree_to_bkey(t, j)) +fix_left: do { + make_bfloat(t, j); + j = j * 2; + } while (j < t->size); + + j = inorder_to_tree(inorder + 1, t); + + if (j && + j < t->size && + k == tree_to_prev_bkey(t, j)) +fix_right: do { + make_bfloat(t, j); + j = j * 2 + 1; + } while (j < t->size); +} + +void bch_bset_fix_lookup_table(struct btree *b, struct bkey *k) +{ + struct bset_tree *t = &b->sets[b->nsets]; + unsigned shift = bkey_u64s(k); + unsigned j = bkey_to_cacheline(t, k); + + /* We're getting called from btree_split() or btree_gc, just bail out */ + if (!t->size) + return; + + /* k is the key we just inserted; we need to find the entry in the + * lookup table for the first key that is strictly greater than k: + * it's either k's cacheline or the next one + */ + if (j < t->size && + table_to_bkey(t, j) <= k) + j++; + + /* Adjust all the lookup table entries, and find a new key for any that + * have gotten too big + */ + for (; j < t->size; j++) { + t->prev[j] += shift; + + if (t->prev[j] > 7) { + k = table_to_bkey(t, j - 1); + + while (k < cacheline_to_bkey(t, j, 0)) + k = bkey_next(k); + + t->prev[j] = bkey_to_cacheline_offset(k); + } + } + + if (t->size == b->sets->tree + bset_tree_space(b) - t->tree) + return; + + /* Possibly add a new entry to the end of the lookup table */ + + for (k = table_to_bkey(t, t->size - 1); + k != end(t->data); + k = bkey_next(k)) + if (t->size == bkey_to_cacheline(t, k)) { + t->prev[t->size] = bkey_to_cacheline_offset(k); + t->size++; + } +} + +void bch_bset_init_next(struct btree *b) +{ + struct bset *i = write_block(b); + + if (i != b->sets[0].data) { + b->sets[++b->nsets].data = i; + i->seq = b->sets[0].data->seq; + } else + get_random_bytes(&i->seq, sizeof(uint64_t)); + + i->magic = bset_magic(b->c); + i->version = 0; + i->keys = 0; + + bset_build_unwritten_tree(b); +} + +struct bset_search_iter { + struct bkey *l, *r; +}; + +static struct bset_search_iter bset_search_write_set(struct btree *b, + struct bset_tree *t, + const struct bkey *search) +{ + unsigned li = 0, ri = t->size; + + BUG_ON(!b->nsets && + t->size < bkey_to_cacheline(t, end(t->data))); + + while (li + 1 != ri) { + unsigned m = (li + ri) >> 1; + + if (bkey_cmp(table_to_bkey(t, m), search) > 0) + ri = m; + else + li = m; + } + + return (struct bset_search_iter) { + table_to_bkey(t, li), + ri < t->size ? table_to_bkey(t, ri) : end(t->data) + }; +} + +static struct bset_search_iter bset_search_tree(struct btree *b, + struct bset_tree *t, + const struct bkey *search) +{ + struct bkey *l, *r; + struct bkey_float *f; + unsigned inorder, j, n = 1; + + do { + unsigned p = n << 4; + p &= ((int) (p - t->size)) >> 31; + + prefetch(&t->tree[p]); + + j = n; + f = &t->tree[j]; + + /* + * n = (f->mantissa > bfloat_mantissa()) + * ? j * 2 + * : j * 2 + 1; + * + * We need to subtract 1 from f->mantissa for the sign bit trick + * to work - that's done in make_bfloat() + */ + if (likely(f->exponent != 127)) + n = j * 2 + (((unsigned) + (f->mantissa - + bfloat_mantissa(search, f))) >> 31); + else + n = (bkey_cmp(tree_to_bkey(t, j), search) > 0) + ? j * 2 + : j * 2 + 1; + } while (n < t->size); + + inorder = to_inorder(j, t); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. 
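+ * If we went right, the node we stopped at is our left bound and its
+ * inorder successor bounds us on the right; going left is the mirror
+ * image, with the inorder predecessor as the left bound.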
+ */ + if (n & 1) { + l = cacheline_to_bkey(t, inorder, f->m); + + if (++inorder != t->size) { + f = &t->tree[inorder_next(j, t->size)]; + r = cacheline_to_bkey(t, inorder, f->m); + } else + r = end(t->data); + } else { + r = cacheline_to_bkey(t, inorder, f->m); + + if (--inorder) { + f = &t->tree[inorder_prev(j, t->size)]; + l = cacheline_to_bkey(t, inorder, f->m); + } else + l = t->data->start; + } + + return (struct bset_search_iter) {l, r}; +} + +struct bkey *__bch_bset_search(struct btree *b, struct bset_tree *t, + const struct bkey *search) +{ + struct bset_search_iter i; + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). + * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + if (unlikely(!t->size)) { + i.l = t->data->start; + i.r = end(t->data); + } else if (bset_written(b, t)) { + /* + * Each node in the auxiliary search tree covers a certain range + * of bits, and keys above and below the set it covers might + * differ outside those bits - so we have to special case the + * start and end - handle that here: + */ + + if (unlikely(bkey_cmp(search, &t->end) >= 0)) + return end(t->data); + + if (unlikely(bkey_cmp(search, t->data->start) < 0)) + return t->data->start; + + i = bset_search_tree(b, t, search); + } else + i = bset_search_write_set(b, t, search); + +#ifdef CONFIG_BCACHE_EDEBUG + BUG_ON(bset_written(b, t) && + i.l != t->data->start && + bkey_cmp(tree_to_prev_bkey(t, + inorder_to_tree(bkey_to_cacheline(t, i.l), t)), + search) > 0); + + BUG_ON(i.r != end(t->data) && + bkey_cmp(i.r, search) <= 0); +#endif + + while (likely(i.l != i.r) && + bkey_cmp(i.l, search) <= 0) + i.l = bkey_next(i.l); + + return i.l; +} + +/* Btree iterator */ + +static inline bool btree_iter_cmp(struct btree_iter_set l, + struct btree_iter_set r) +{ + int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k)); + + return c ? 
c > 0 : l.k < r.k; +} + +static inline bool btree_iter_end(struct btree_iter *iter) +{ + return !iter->used; +} + +void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, + struct bkey *end) +{ + if (k != end) + BUG_ON(!heap_add(iter, + ((struct btree_iter_set) { k, end }), + btree_iter_cmp)); +} + +struct bkey *__bch_btree_iter_init(struct btree *b, struct btree_iter *iter, + struct bkey *search, struct bset_tree *start) +{ + struct bkey *ret = NULL; + iter->size = ARRAY_SIZE(iter->data); + iter->used = 0; + + for (; start <= &b->sets[b->nsets]; start++) { + ret = bch_bset_search(b, start, search); + bch_btree_iter_push(iter, ret, end(start->data)); + } + + return ret; +} + +struct bkey *bch_btree_iter_next(struct btree_iter *iter) +{ + struct btree_iter_set unused; + struct bkey *ret = NULL; + + if (!btree_iter_end(iter)) { + ret = iter->data->k; + iter->data->k = bkey_next(iter->data->k); + + if (iter->data->k > iter->data->end) { + __WARN(); + iter->data->k = iter->data->end; + } + + if (iter->data->k == iter->data->end) + heap_pop(iter, unused, btree_iter_cmp); + else + heap_sift(iter, 0, btree_iter_cmp); + } + + return ret; +} + +struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, + struct btree *b, ptr_filter_fn fn) +{ + struct bkey *ret; + + do { + ret = bch_btree_iter_next(iter); + } while (ret && fn(b, ret)); + + return ret; +} + +struct bkey *bch_next_recurse_key(struct btree *b, struct bkey *search) +{ + struct btree_iter iter; + + bch_btree_iter_init(b, &iter, search); + return bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); +} + +/* Mergesort */ + +static void btree_sort_fixup(struct btree_iter *iter) +{ + while (iter->used > 1) { + struct btree_iter_set *top = iter->data, *i = top + 1; + struct bkey *k; + + if (iter->used > 2 && + btree_iter_cmp(i[0], i[1])) + i++; + + for (k = i->k; + k != i->end && bkey_cmp(top->k, &START_KEY(k)) > 0; + k = bkey_next(k)) + if (top->k > i->k) + __bch_cut_front(top->k, k); + else if (KEY_SIZE(k)) + bch_cut_back(&START_KEY(k), top->k); + + if (top->k < i->k || k == i->k) + break; + + heap_sift(iter, i - top, btree_iter_cmp); + } +} + +static void btree_mergesort(struct btree *b, struct bset *out, + struct btree_iter *iter, + bool fixup, bool remove_stale) +{ + struct bkey *k, *last = NULL; + bool (*bad)(struct btree *, const struct bkey *) = remove_stale + ? bch_ptr_bad + : bch_ptr_invalid; + + while (!btree_iter_end(iter)) { + if (fixup && !b->level) + btree_sort_fixup(iter); + + k = bch_btree_iter_next(iter); + if (bad(b, k)) + continue; + + if (!last) { + last = out->start; + bkey_copy(last, k); + } else if (b->level || + !bch_bkey_try_merge(b, last, k)) { + last = bkey_next(last); + bkey_copy(last, k); + } + } + + out->keys = last ? 
(uint64_t *) bkey_next(last) - out->d : 0; + + pr_debug("sorted %i keys", out->keys); + bch_check_key_order(b, out); +} + +static void __btree_sort(struct btree *b, struct btree_iter *iter, + unsigned start, unsigned order, bool fixup) +{ + uint64_t start_time; + bool remove_stale = !b->written; + struct bset *out = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOIO, + order); + if (!out) { + mutex_lock(&b->c->sort_lock); + out = b->c->sort; + order = ilog2(bucket_pages(b->c)); + } + + start_time = local_clock(); + + btree_mergesort(b, out, iter, fixup, remove_stale); + b->nsets = start; + + if (!fixup && !start && b->written) + bch_btree_verify(b, out); + + if (!start && order == b->page_order) { + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + + out->magic = bset_magic(b->c); + out->seq = b->sets[0].data->seq; + out->version = b->sets[0].data->version; + swap(out, b->sets[0].data); + + if (b->c->sort == b->sets[0].data) + b->c->sort = out; + } else { + b->sets[start].data->keys = out->keys; + memcpy(b->sets[start].data->start, out->start, + (void *) end(out) - (void *) out->start); + } + + if (out == b->c->sort) + mutex_unlock(&b->c->sort_lock); + else + free_pages((unsigned long) out, order); + + if (b->written) + bset_build_written_tree(b); + + if (!start) { + spin_lock(&b->c->sort_time_lock); + time_stats_update(&b->c->sort_time, start_time); + spin_unlock(&b->c->sort_time_lock); + } +} + +void bch_btree_sort_partial(struct btree *b, unsigned start) +{ + size_t oldsize = 0, order = b->page_order, keys = 0; + struct btree_iter iter; + __bch_btree_iter_init(b, &iter, NULL, &b->sets[start]); + + BUG_ON(b->sets[b->nsets].data == write_block(b) && + (b->sets[b->nsets].size || b->nsets)); + + if (b->written) + oldsize = bch_count_data(b); + + if (start) { + unsigned i; + + for (i = start; i <= b->nsets; i++) + keys += b->sets[i].data->keys; + + order = roundup_pow_of_two(__set_bytes(b->sets->data, keys)) / PAGE_SIZE; + if (order) + order = ilog2(order); + } + + __btree_sort(b, &iter, start, order, false); + + EBUG_ON(b->written && bch_count_data(b) != oldsize); +} + +void bch_btree_sort_and_fix_extents(struct btree *b, struct btree_iter *iter) +{ + BUG_ON(!b->written); + __btree_sort(b, iter, 0, b->page_order, true); +} + +void bch_btree_sort_into(struct btree *b, struct btree *new) +{ + uint64_t start_time = local_clock(); + + struct btree_iter iter; + bch_btree_iter_init(b, &iter, NULL); + + btree_mergesort(b, new->sets->data, &iter, false, true); + + spin_lock(&b->c->sort_time_lock); + time_stats_update(&b->c->sort_time, start_time); + spin_unlock(&b->c->sort_time_lock); + + bkey_copy_key(&new->key, &b->key); + new->sets->size = 0; +} + +void bch_btree_sort_lazy(struct btree *b) +{ + if (b->nsets) { + unsigned i, j, keys = 0, total; + + for (i = 0; i <= b->nsets; i++) + keys += b->sets[i].data->keys; + + total = keys; + + for (j = 0; j < b->nsets; j++) { + if (keys * 2 < total || + keys < 1000) { + bch_btree_sort_partial(b, j); + return; + } + + keys -= b->sets[j].data->keys; + } + + /* Must sort if b->nsets == 3 or we'll overflow */ + if (b->nsets >= (MAX_BSETS - 1) - b->level) { + bch_btree_sort(b); + return; + } + } + + bset_build_written_tree(b); +} + +/* Sysfs stuff */ + +struct bset_stats { + size_t nodes; + size_t sets_written, sets_unwritten; + size_t bytes_written, bytes_unwritten; + size_t floats, failed; +}; + +static int bch_btree_bset_stats(struct btree *b, struct btree_op *op, + 
				struct bset_stats *stats)
+{
+	struct bkey *k;
+	unsigned i;
+
+	stats->nodes++;
+
+	for (i = 0; i <= b->nsets; i++) {
+		struct bset_tree *t = &b->sets[i];
+		size_t bytes = t->data->keys * sizeof(uint64_t);
+		size_t j;
+
+		if (bset_written(b, t)) {
+			stats->sets_written++;
+			stats->bytes_written += bytes;
+
+			stats->floats += t->size - 1;
+
+			for (j = 1; j < t->size; j++)
+				if (t->tree[j].exponent == 127)
+					stats->failed++;
+		} else {
+			stats->sets_unwritten++;
+			stats->bytes_unwritten += bytes;
+		}
+	}
+
+	if (b->level) {
+		struct btree_iter iter;
+
+		for_each_key_filter(b, k, &iter, bch_ptr_bad) {
+			int ret = btree(bset_stats, k, b, op, stats);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+int bch_bset_print_stats(struct cache_set *c, char *buf)
+{
+	struct btree_op op;
+	struct bset_stats t;
+	int ret;
+
+	bch_btree_op_init_stack(&op);
+	memset(&t, 0, sizeof(struct bset_stats));
+
+	ret = btree_root(bset_stats, c, &op, &t);
+	if (ret)
+		return ret;
+
+	return snprintf(buf, PAGE_SIZE,
+			"btree nodes:		%zu\n"
+			"written sets:		%zu\n"
+			"unwritten sets:	%zu\n"
+			"written key bytes:	%zu\n"
+			"unwritten key bytes:	%zu\n"
+			"floats:		%zu\n"
+			"failed:		%zu\n",
+			t.nodes,
+			t.sets_written, t.sets_unwritten,
+			t.bytes_written, t.bytes_unwritten,
+			t.floats, t.failed);
+}
diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h
new file mode 100644
index 000000000000..57a9cff41546
--- /dev/null
+++ b/drivers/md/bcache/bset.h
@@ -0,0 +1,379 @@
+#ifndef _BCACHE_BSET_H
+#define _BCACHE_BSET_H
+
+/*
+ * BKEYS:
+ *
+ * A bkey contains a key, a size field, a variable number of pointers, and some
+ * ancillary flag bits.
+ *
+ * We use two different functions for validating bkeys, bch_ptr_invalid and
+ * bch_ptr_bad().
+ *
+ * bch_ptr_invalid() primarily filters out keys and pointers that would be
+ * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
+ * pointers that occur in normal practice but don't point to real data.
+ *
+ * The one exception to the rule that ptr_invalid() filters out invalid keys is
+ * that it also filters out keys of size 0 - these are keys that have been
+ * completely overwritten. It'd be safe to delete these in memory while leaving
+ * them on disk, just unnecessary work - so we filter them out when resorting
+ * instead.
+ *
+ * We can't filter out stale keys when we're resorting, because garbage
+ * collection needs to find them to ensure bucket gens don't wrap around -
+ * unless we're rewriting the btree node those stale keys still exist on disk.
+ *
+ * We also implement functions here for removing some number of sectors from the
+ * front or the back of a bkey - this is mainly used for fixing overlapping
+ * extents, by removing the overlapping sectors from the older key.
+ *
+ * BSETS:
+ *
+ * A bset is an array of bkeys laid out contiguously in memory in sorted order,
+ * along with a header. A btree node is made up of a number of these, written at
+ * different times.
+ *
+ * There could be many of them on disk, but we never allow there to be more than
+ * 4 in memory - we lazily resort as needed.
+ *
+ * We implement code here for creating and maintaining auxiliary search trees
+ * (described below) for searching an individual bset, and on top of that we
+ * implement a btree iterator.
+ *
+ * BTREE ITERATOR:
+ *
+ * Most of the code in bcache doesn't care about an individual bset - it needs
+ * to search entire btree nodes and iterate over them in sorted order.
+ *
+ * The btree iterator code serves both functions; it iterates through the keys
+ * in a btree node in sorted order, starting from either keys after a specific
+ * point (if you pass it a search key) or the start of the btree node.
+ *
+ * AUXILIARY SEARCH TREES:
+ *
+ * Since keys are variable length, we can't use a binary search on a bset - we
+ * wouldn't be able to find the start of the next key. But binary searches are
+ * slow anyway, due to terrible cache behaviour; bcache originally used binary
+ * searches and that code topped out at under 50k lookups/second.
+ *
+ * So we need to construct some sort of lookup table. Since we only insert keys
+ * into the last (unwritten) set, most of the keys within a given btree node are
+ * usually in sets that are mostly constant. We use two different types of
+ * lookup tables to take advantage of this.
+ *
+ * Both lookup tables share in common that they don't index every key in the
+ * set; they index one key every BSET_CACHELINE bytes, and then a linear search
+ * is used for the rest.
+ *
+ * For sets that have been written to disk and are no longer being inserted
+ * into, we construct a binary search tree in an array - traversing a binary
+ * search tree in an array gives excellent locality of reference and is very
+ * fast, since both children of any node are adjacent to each other in memory
+ * (and their grandchildren, and great grandchildren...) - this means
+ * prefetching can be used to great effect.
+ *
+ * It's quite useful performance-wise to keep these nodes small - not just
+ * because they're more likely to be in L2, but also because we can prefetch
+ * more nodes on a single cacheline and thus prefetch more iterations in advance
+ * when traversing this tree.
+ *
+ * Nodes in the auxiliary search tree must contain both a key to compare against
+ * (we don't want to fetch the key from the set, that would defeat the purpose),
+ * and a pointer to the key. We use a few tricks to compress both of these.
+ *
+ * To compress the pointer, we take advantage of the fact that one node in the
+ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
+ * a function (to_inorder()) that takes the index of a node in a binary tree and
+ * returns what its index would be in an inorder traversal, so we only have to
+ * store the low bits of the offset.
+ *
+ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To
+ * compress that, we take advantage of the fact that when we're traversing the
+ * search tree at every iteration we know that both our search key and the key
+ * we're looking for lie within some range - bounded by our previous
+ * comparisons. (We special case the start of a search so that this is true even
+ * at the root of the tree).
+ *
+ * So if we know the key we're looking for is between a and b, and a and b
+ * don't differ above bit 50, we don't need to check anything higher than bit
+ * 50.
+ *
+ * We don't usually need the rest of the bits, either; we only need enough bits
+ * to partition the key range we're currently checking. Consider key n - the
+ * key our auxiliary search tree node corresponds to, and key p, the key
+ * immediately preceding n. The lowest bit we need to store in the auxiliary
+ * search tree is the highest bit that differs between n and p.
+ *
+ * Note that this could be bit 0 - we might sometimes need all 84 bits to do the
+ * comparison. But we'd really like our nodes in the auxiliary search tree to be
+ * of fixed size.
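+ *
+ * (A concrete example, with made-up six-bit keys: if p = 101000 and
+ * n = 101100, the highest bit that differs is bit 2, so bit 2 is the
+ * lowest bit this node's mantissa has to include.)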
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+/* Btree key comparison/iteration */
+
+struct btree_iter {
+	size_t size, used;
+	struct btree_iter_set {
+		struct bkey *k, *end;
+	} data[MAX_BSETS];
+};
+
+struct bset_tree {
+	/*
+	 * We construct a binary tree in an array as if the array
+	 * started at 1, so that things line up on the same cachelines
+	 * better: see comments in bset.c at cacheline_to_bkey() for
+	 * details
+	 */
+
+	/* size of the binary tree and prev array */
+	unsigned	size;
+
+	/* function of size - precalculated for to_inorder() */
+	unsigned	extra;
+
+	/* copy of the last key in the set */
+	struct bkey	end;
+	struct bkey_float *tree;
+
+	/*
+	 * The nodes in the bset tree point to specific keys - this
+	 * array holds the sizes of the previous key.
+	 *
+	 * Conceptually it's a member of struct bkey_float, but we want
+	 * to keep bkey_float to 4 bytes and prev isn't used in the fast
+	 * path.
+	 */
+	uint8_t		*prev;
+
+	/* The actual btree node, with pointers to each sorted set */
+	struct bset	*data;
+};
+
+static __always_inline int64_t bkey_cmp(const struct bkey *l,
+					const struct bkey *r)
+{
+	return unlikely(KEY_INODE(l) != KEY_INODE(r))
+		? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
+		: (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
+}
+
+static inline size_t bkey_u64s(const struct bkey *k)
+{
+	BUG_ON(KEY_CSUM(k) > 1);
+	return 2 + KEY_PTRS(k) + (KEY_CSUM(k) ?
1 : 0); +} + +static inline size_t bkey_bytes(const struct bkey *k) +{ + return bkey_u64s(k) * sizeof(uint64_t); +} + +static inline void bkey_copy(struct bkey *dest, const struct bkey *src) +{ + memcpy(dest, src, bkey_bytes(src)); +} + +static inline void bkey_copy_key(struct bkey *dest, const struct bkey *src) +{ + if (!src) + src = &KEY(0, 0, 0); + + SET_KEY_INODE(dest, KEY_INODE(src)); + SET_KEY_OFFSET(dest, KEY_OFFSET(src)); +} + +static inline struct bkey *bkey_next(const struct bkey *k) +{ + uint64_t *d = (void *) k; + return (struct bkey *) (d + bkey_u64s(k)); +} + +/* Keylists */ + +struct keylist { + struct bkey *top; + union { + uint64_t *list; + struct bkey *bottom; + }; + + /* Enough room for btree_split's keys without realloc */ +#define KEYLIST_INLINE 16 + uint64_t d[KEYLIST_INLINE]; +}; + +static inline void bch_keylist_init(struct keylist *l) +{ + l->top = (void *) (l->list = l->d); +} + +static inline void bch_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch_keylist_add(struct keylist *l, struct bkey *k) +{ + bkey_copy(l->top, k); + bch_keylist_push(l); +} + +static inline bool bch_keylist_empty(struct keylist *l) +{ + return l->top == (void *) l->list; +} + +static inline void bch_keylist_free(struct keylist *l) +{ + if (l->list != l->d) + kfree(l->list); +} + +void bch_keylist_copy(struct keylist *, struct keylist *); +struct bkey *bch_keylist_pop(struct keylist *); +int bch_keylist_realloc(struct keylist *, int, struct cache_set *); + +void bch_bkey_copy_single_ptr(struct bkey *, const struct bkey *, + unsigned); +bool __bch_cut_front(const struct bkey *, struct bkey *); +bool __bch_cut_back(const struct bkey *, struct bkey *); + +static inline bool bch_cut_front(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, k) > 0); + return __bch_cut_front(where, k); +} + +static inline bool bch_cut_back(const struct bkey *where, struct bkey *k) +{ + BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0); + return __bch_cut_back(where, k); +} + +const char *bch_ptr_status(struct cache_set *, const struct bkey *); +bool __bch_ptr_invalid(struct cache_set *, int level, const struct bkey *); +bool bch_ptr_bad(struct btree *, const struct bkey *); + +static inline uint8_t gen_after(uint8_t a, uint8_t b) +{ + uint8_t r = a - b; + return r > 128U ? 
0 : r;
+}
+
+static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
+				unsigned i)
+{
+	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
+}
+
+static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
+				 unsigned i)
+{
+	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && PTR_CACHE(c, k, i);
+}
+
+
+typedef bool (*ptr_filter_fn)(struct btree *, const struct bkey *);
+
+struct bkey *bch_next_recurse_key(struct btree *, struct bkey *);
+struct bkey *bch_btree_iter_next(struct btree_iter *);
+struct bkey *bch_btree_iter_next_filter(struct btree_iter *,
+					struct btree *, ptr_filter_fn);
+
+void bch_btree_iter_push(struct btree_iter *, struct bkey *, struct bkey *);
+struct bkey *__bch_btree_iter_init(struct btree *, struct btree_iter *,
+				   struct bkey *, struct bset_tree *);
+
+/* 32 bits total: */
+#define BKEY_MID_BITS		3
+#define BKEY_EXPONENT_BITS	7
+#define BKEY_MANTISSA_BITS	22
+#define BKEY_MANTISSA_MASK	((1 << BKEY_MANTISSA_BITS) - 1)
+
+struct bkey_float {
+	unsigned	exponent:BKEY_EXPONENT_BITS;
+	unsigned	m:BKEY_MID_BITS;
+	unsigned	mantissa:BKEY_MANTISSA_BITS;
+} __packed;
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
+ */
+
+#define BSET_CACHELINE		128
+#define bset_tree_space(b)	(btree_data_space(b) / BSET_CACHELINE)
+
+#define bset_tree_bytes(b)	(bset_tree_space(b) * sizeof(struct bkey_float))
+#define bset_prev_bytes(b)	(bset_tree_space(b) * sizeof(uint8_t))
+
+void bch_bset_init_next(struct btree *);
+
+void bch_bset_fix_invalidated_key(struct btree *, struct bkey *);
+void bch_bset_fix_lookup_table(struct btree *, struct bkey *);
+
+struct bkey *__bch_bset_search(struct btree *, struct bset_tree *,
+			       const struct bkey *);
+
+static inline struct bkey *bch_bset_search(struct btree *b, struct bset_tree *t,
+					   const struct bkey *search)
+{
+	return search ? __bch_bset_search(b, t, search) : t->data->start;
+}
+
+bool bch_bkey_try_merge(struct btree *, struct bkey *, struct bkey *);
+void bch_btree_sort_lazy(struct btree *);
+void bch_btree_sort_into(struct btree *, struct btree *);
+void bch_btree_sort_and_fix_extents(struct btree *, struct btree_iter *);
+void bch_btree_sort_partial(struct btree *, unsigned);
+
+static inline void bch_btree_sort(struct btree *b)
+{
+	bch_btree_sort_partial(b, 0);
+}
+
+int bch_bset_print_stats(struct cache_set *, char *);
+
+#endif
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
new file mode 100644
index 000000000000..e7bc917ef0d7
--- /dev/null
+++ b/drivers/md/bcache/btree.c
@@ -0,0 +1,2503 @@
+/*
+ * Copyright (C) 2010 Kent Overstreet
+ *
+ * Uses a block device as cache for other block devices; optimized for SSDs.
+ * All allocation is done in buckets, which should match the erase block size
+ * of the device.
+ *
+ * Buckets containing cached data are kept on a heap sorted by priority;
+ * bucket priority is increased on cache hit, and periodically all the buckets
+ * on the heap have their priority scaled down. This currently is just used as
+ * an LRU but in the future should allow for more intelligent heuristics.
+ *
+ * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
+ * counter. Garbage collection is used to remove stale pointers.
+ *
+ * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
+ * as keys are inserted we only sort the pages that have not yet been written.
+ * When garbage collection is run, we resort the entire node.
+ *
+ * All configuration is done via sysfs; see Documentation/bcache.txt.
+ */
+
+#include "bcache.h"
+#include "btree.h"
+#include "debug.h"
+#include "request.h"
+
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/hash.h>
+#include <linux/prefetch.h>
+#include <linux/random.h>
+#include <linux/rcupdate.h>
+
+/*
+ * Todo:
+ * register_bcache: Return errors out to userspace correctly
+ *
+ * Writeback: don't undirty key until after a cache flush
+ *
+ * Create an iterator for key pointers
+ *
+ * On btree write error, mark bucket such that it won't be freed from the cache
+ *
+ * Journalling:
+ *   Check for bad keys in replay
+ *   Propagate barriers
+ *   Refcount journal entries in journal_replay
+ *
+ * Garbage collection:
+ *   Finish incremental gc
+ *   Gc should free old UUIDs, data for invalid UUIDs
+ *
+ * Provide a way to list backing device UUIDs we have data cached for, and
+ * probably how long it's been since we've seen them, and a way to invalidate
+ * dirty data for devices that will never be attached again
+ *
+ * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
+ * that based on that and how much dirty data we have we can keep writeback
+ * from being starved
+ *
+ * Add a tracepoint or somesuch to watch for writeback starvation
+ *
+ * When btree depth > 1 and splitting an interior node, we have to make sure
+ * alloc_bucket() cannot fail. This should be true but is not completely
+ * obvious.
+ *
+ * Make sure all allocations get charged to the root cgroup
+ *
+ * Plugging?
+ *
+ * If data write is less than hard sector size of ssd, round up offset in open
+ * bucket to the next whole sector
+ *
+ * Also lookup by cgroup in get_open_bucket()
+ *
+ * Superblock needs to be fleshed out for multiple cache devices
+ *
+ * Add a sysfs tunable for the number of writeback IOs in flight
+ *
+ * Add a sysfs tunable for the number of open data buckets
+ *
+ * IO tracking: Can we track when one process is doing io on behalf of another?
+ * IO tracking: Don't use just an average, weigh more recent stuff higher + * + * Test module load/unload + */ + +static const char * const op_types[] = { + "insert", "replace" +}; + +static const char *op_type(struct btree_op *op) +{ + return op_types[op->type]; +} + +#define MAX_NEED_GC 64 +#define MAX_SAVE_PRIO 72 + +#define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) + +#define PTR_HASH(c, k) \ + (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) + +struct workqueue_struct *bch_gc_wq; +static struct workqueue_struct *btree_io_wq; + +void bch_btree_op_init_stack(struct btree_op *op) +{ + memset(op, 0, sizeof(struct btree_op)); + closure_init_stack(&op->cl); + op->lock = -1; + bch_keylist_init(&op->keys); +} + +/* Btree key manipulation */ + +static void bkey_put(struct cache_set *c, struct bkey *k, int level) +{ + if ((level && KEY_OFFSET(k)) || !level) + __bkey_put(c, k); +} + +/* Btree IO */ + +static uint64_t btree_csum_set(struct btree *b, struct bset *i) +{ + uint64_t crc = b->key.ptr[0]; + void *data = (void *) i + 8, *end = end(i); + + crc = crc64_update(crc, data, end - data); + return crc ^ 0xffffffffffffffff; +} + +static void btree_bio_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree *b = container_of(cl, struct btree, io.cl); + + if (error) + set_btree_node_io_error(b); + + bch_bbio_count_io_errors(b->c, bio, error, (bio->bi_rw & WRITE) + ? "writing btree" : "reading btree"); + closure_put(cl); +} + +static void btree_bio_init(struct btree *b) +{ + BUG_ON(b->bio); + b->bio = bch_bbio_alloc(b->c); + + b->bio->bi_end_io = btree_bio_endio; + b->bio->bi_private = &b->io.cl; +} + +void bch_btree_read_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct bset *i = b->sets[0].data; + struct btree_iter *iter = b->c->fill_iter; + const char *err = "bad btree header"; + BUG_ON(b->nsets || b->written); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + + mutex_lock(&b->c->fill_lock); + iter->used = 0; + + if (btree_node_io_error(b) || + !i->seq) + goto err; + + for (; + b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq; + i = write_block(b)) { + err = "unsupported bset version"; + if (i->version > BCACHE_BSET_VERSION) + goto err; + + err = "bad btree header"; + if (b->written + set_blocks(i, b->c) > btree_blocks(b)) + goto err; + + err = "bad magic"; + if (i->magic != bset_magic(b->c)) + goto err; + + err = "bad checksum"; + switch (i->version) { + case 0: + if (i->csum != csum_set(i)) + goto err; + break; + case BCACHE_BSET_VERSION: + if (i->csum != btree_csum_set(b, i)) + goto err; + break; + } + + err = "empty set"; + if (i != b->sets[0].data && !i->keys) + goto err; + + bch_btree_iter_push(iter, i->start, end(i)); + + b->written += set_blocks(i, b->c); + } + + err = "corrupted btree"; + for (i = write_block(b); + index(i, b) < btree_blocks(b); + i = ((void *) i) + block_bytes(b->c)) + if (i->seq == b->sets[0].data->seq) + goto err; + + bch_btree_sort_and_fix_extents(b, iter); + + i = b->sets[0].data; + err = "short btree key"; + if (b->sets[0].size && + bkey_cmp(&b->key, &b->sets[0].end) < 0) + goto err; + + if (b->written < btree_blocks(b)) + bch_bset_init_next(b); +out: + + mutex_unlock(&b->c->fill_lock); + + spin_lock(&b->c->btree_read_time_lock); + time_stats_update(&b->c->btree_read_time, b->io_start_time); + spin_unlock(&b->c->btree_read_time_lock); + + smp_wmb(); /* read_done is our write lock */ + set_btree_node_read_done(b); + + closure_return(cl); +err: + 
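+	/*
+	 * Every failure above lands here with 'err' naming what we choked
+	 * on; a bad btree node isn't something we can recover from, so flag
+	 * the node for anyone waiting on the read and error out the whole
+	 * cache set, then rejoin the normal completion path at 'out'.
+	 */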
set_btree_node_io_error(b); + bch_cache_set_error(b->c, "%s at bucket %lu, block %zu, %u keys", + err, PTR_BUCKET_NR(b->c, &b->key, 0), + index(i, b), i->keys); + goto out; +} + +void bch_btree_read(struct btree *b) +{ + BUG_ON(b->nsets || b->written); + + if (!closure_trylock(&b->io.cl, &b->c->cl)) + BUG(); + + b->io_start_time = local_clock(); + + btree_bio_init(b); + b->bio->bi_rw = REQ_META|READ_SYNC; + b->bio->bi_size = KEY_SIZE(&b->key) << 9; + + bio_map(b->bio, b->sets[0].data); + + pr_debug("%s", pbtree(b)); + trace_bcache_btree_read(b->bio); + bch_submit_bbio(b->bio, b->c, &b->key, 0); + + continue_at(&b->io.cl, bch_btree_read_done, system_wq); +} + +static void btree_complete_write(struct btree *b, struct btree_write *w) +{ + if (w->prio_blocked && + !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) + wake_up(&b->c->alloc_wait); + + if (w->journal) { + atomic_dec_bug(w->journal); + __closure_wake_up(&b->c->journal.wait); + } + + if (w->owner) + closure_put(w->owner); + + w->prio_blocked = 0; + w->journal = NULL; + w->owner = NULL; +} + +static void __btree_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct btree_write *w = btree_prev_write(b); + + bch_bbio_free(b->bio, b->c); + b->bio = NULL; + btree_complete_write(b, w); + + if (btree_node_dirty(b)) + queue_delayed_work(btree_io_wq, &b->work, + msecs_to_jiffies(30000)); + + closure_return(cl); +} + +static void btree_write_done(struct closure *cl) +{ + struct btree *b = container_of(cl, struct btree, io.cl); + struct bio_vec *bv; + int n; + + __bio_for_each_segment(bv, b->bio, n, 0) + __free_page(bv->bv_page); + + __btree_write_done(cl); +} + +static void do_btree_write(struct btree *b) +{ + struct closure *cl = &b->io.cl; + struct bset *i = b->sets[b->nsets].data; + BKEY_PADDED(key) k; + + i->version = BCACHE_BSET_VERSION; + i->csum = btree_csum_set(b, i); + + btree_bio_init(b); + b->bio->bi_rw = REQ_META|WRITE_SYNC; + b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c); + bio_map(b->bio, i); + + bkey_copy(&k.key, &b->key); + SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i)); + + if (!bio_alloc_pages(b->bio, GFP_NOIO)) { + int j; + struct bio_vec *bv; + void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); + + bio_for_each_segment(bv, b->bio, j) + memcpy(page_address(bv->bv_page), + base + j * PAGE_SIZE, PAGE_SIZE); + + trace_bcache_btree_write(b->bio); + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + continue_at(cl, btree_write_done, NULL); + } else { + b->bio->bi_vcnt = 0; + bio_map(b->bio, i); + + trace_bcache_btree_write(b->bio); + bch_submit_bbio(b->bio, b->c, &k.key, 0); + + closure_sync(cl); + __btree_write_done(cl); + } +} + +static void __btree_write(struct btree *b) +{ + struct bset *i = b->sets[b->nsets].data; + + BUG_ON(current->bio_list); + + closure_lock(&b->io, &b->c->cl); + cancel_delayed_work(&b->work); + + clear_bit(BTREE_NODE_dirty, &b->flags); + change_bit(BTREE_NODE_write_idx, &b->flags); + + bch_check_key_order(b, i); + BUG_ON(b->written && !i->keys); + + do_btree_write(b); + + pr_debug("%s block %i keys %i", pbtree(b), b->written, i->keys); + + b->written += set_blocks(i, b->c); + atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size, + &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); + + bch_btree_sort_lazy(b); + + if (b->written < btree_blocks(b)) + bch_bset_init_next(b); +} + +static void btree_write_work(struct work_struct *w) +{ + struct btree *b = container_of(to_delayed_work(w), struct btree, work); 
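+	/*
+	 * This runs off the delayed work queued by bch_btree_write(); by the
+	 * time it fires the node may already have been written out (and the
+	 * dirty bit cleared), hence the btree_node_dirty() recheck under the
+	 * write lock below.
+	 */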
+ + down_write(&b->lock); + + if (btree_node_dirty(b)) + __btree_write(b); + up_write(&b->lock); +} + +void bch_btree_write(struct btree *b, bool now, struct btree_op *op) +{ + struct bset *i = b->sets[b->nsets].data; + struct btree_write *w = btree_current_write(b); + + BUG_ON(b->written && + (b->written >= btree_blocks(b) || + i->seq != b->sets[0].data->seq || + !i->keys)); + + if (!btree_node_dirty(b)) { + set_btree_node_dirty(b); + queue_delayed_work(btree_io_wq, &b->work, + msecs_to_jiffies(30000)); + } + + w->prio_blocked += b->prio_blocked; + b->prio_blocked = 0; + + if (op && op->journal && !b->level) { + if (w->journal && + journal_pin_cmp(b->c, w, op)) { + atomic_dec_bug(w->journal); + w->journal = NULL; + } + + if (!w->journal) { + w->journal = op->journal; + atomic_inc(w->journal); + } + } + + if (current->bio_list) + return; + + /* Force write if set is too big */ + if (now || + b->level || + set_bytes(i) > PAGE_SIZE - 48) { + if (op && now) { + /* Must wait on multiple writes */ + BUG_ON(w->owner); + w->owner = &op->cl; + closure_get(&op->cl); + } + + __btree_write(b); + } + BUG_ON(!b->written); +} + +/* + * Btree in memory cache - allocation/freeing + * mca -> memory cache + */ + +static void mca_reinit(struct btree *b) +{ + unsigned i; + + b->flags = 0; + b->written = 0; + b->nsets = 0; + + for (i = 0; i < MAX_BSETS; i++) + b->sets[i].size = 0; + /* + * Second loop starts at 1 because b->sets[0]->data is the memory we + * allocated + */ + for (i = 1; i < MAX_BSETS; i++) + b->sets[i].data = NULL; +} + +#define mca_reserve(c) (((c->root && c->root->level) \ + ? c->root->level : 1) * 8 + 16) +#define mca_can_free(c) \ + max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) + +static void mca_data_free(struct btree *b) +{ + struct bset_tree *t = b->sets; + BUG_ON(!closure_is_unlocked(&b->io.cl)); + + if (bset_prev_bytes(b) < PAGE_SIZE) + kfree(t->prev); + else + free_pages((unsigned long) t->prev, + get_order(bset_prev_bytes(b))); + + if (bset_tree_bytes(b) < PAGE_SIZE) + kfree(t->tree); + else + free_pages((unsigned long) t->tree, + get_order(bset_tree_bytes(b))); + + free_pages((unsigned long) t->data, b->page_order); + + t->prev = NULL; + t->tree = NULL; + t->data = NULL; + list_move(&b->list, &b->c->btree_cache_freed); + b->c->bucket_cache_used--; +} + +static void mca_bucket_free(struct btree *b) +{ + BUG_ON(btree_node_dirty(b)); + + b->key.ptr[0] = 0; + hlist_del_init_rcu(&b->hash); + list_move(&b->list, &b->c->btree_cache_freeable); +} + +static unsigned btree_order(struct bkey *k) +{ + return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); +} + +static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) +{ + struct bset_tree *t = b->sets; + BUG_ON(t->data); + + b->page_order = max_t(unsigned, + ilog2(b->c->btree_pages), + btree_order(k)); + + t->data = (void *) __get_free_pages(gfp, b->page_order); + if (!t->data) + goto err; + + t->tree = bset_tree_bytes(b) < PAGE_SIZE + ? kmalloc(bset_tree_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); + if (!t->tree) + goto err; + + t->prev = bset_prev_bytes(b) < PAGE_SIZE + ? 
kmalloc(bset_prev_bytes(b), gfp) + : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); + if (!t->prev) + goto err; + + list_move(&b->list, &b->c->btree_cache); + b->c->bucket_cache_used++; + return; +err: + mca_data_free(b); +} + +static struct btree *mca_bucket_alloc(struct cache_set *c, + struct bkey *k, gfp_t gfp) +{ + struct btree *b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + init_rwsem(&b->lock); + lockdep_set_novalidate_class(&b->lock); + INIT_LIST_HEAD(&b->list); + INIT_DELAYED_WORK(&b->work, btree_write_work); + b->c = c; + closure_init_unlocked(&b->io); + + mca_data_alloc(b, k, gfp); + return b; +} + +static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) +{ + lockdep_assert_held(&b->c->bucket_lock); + + if (!down_write_trylock(&b->lock)) + return -ENOMEM; + + if (b->page_order < min_order) { + rw_unlock(true, b); + return -ENOMEM; + } + + BUG_ON(btree_node_dirty(b) && !b->sets[0].data); + + if (cl && btree_node_dirty(b)) + bch_btree_write(b, true, NULL); + + if (cl) + closure_wait_event_async(&b->io.wait, cl, + atomic_read(&b->io.cl.remaining) == -1); + + if (btree_node_dirty(b) || + !closure_is_unlocked(&b->io.cl) || + work_pending(&b->work.work)) { + rw_unlock(true, b); + return -EAGAIN; + } + + return 0; +} + +static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc) +{ + struct cache_set *c = container_of(shrink, struct cache_set, shrink); + struct btree *b, *t; + unsigned long i, nr = sc->nr_to_scan; + + if (c->shrinker_disabled) + return 0; + + if (c->try_harder) + return 0; + + /* + * If nr == 0, we're supposed to return the number of items we have + * cached. Not allowed to return -1. + */ + if (!nr) + return mca_can_free(c) * c->btree_pages; + + /* Return -1 if we can't do anything right now */ + if (sc->gfp_mask & __GFP_WAIT) + mutex_lock(&c->bucket_lock); + else if (!mutex_trylock(&c->bucket_lock)) + return -1; + + nr /= c->btree_pages; + nr = min_t(unsigned long, nr, mca_can_free(c)); + + i = 0; + list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { + if (!nr) + break; + + if (++i > 3 && + !mca_reap(b, NULL, 0)) { + mca_data_free(b); + rw_unlock(true, b); + --nr; + } + } + + /* + * Can happen right when we first start up, before we've read in any + * btree nodes + */ + if (list_empty(&c->btree_cache)) + goto out; + + for (i = 0; nr && i < c->bucket_cache_used; i++) { + b = list_first_entry(&c->btree_cache, struct btree, list); + list_rotate_left(&c->btree_cache); + + if (!b->accessed && + !mca_reap(b, NULL, 0)) { + mca_bucket_free(b); + mca_data_free(b); + rw_unlock(true, b); + --nr; + } else + b->accessed = 0; + } +out: + nr = mca_can_free(c) * c->btree_pages; + mutex_unlock(&c->bucket_lock); + return nr; +} + +void bch_btree_cache_free(struct cache_set *c) +{ + struct btree *b; + struct closure cl; + closure_init_stack(&cl); + + if (c->shrink.list.next) + unregister_shrinker(&c->shrink); + + mutex_lock(&c->bucket_lock); + +#ifdef CONFIG_BCACHE_DEBUG + if (c->verify_data) + list_move(&c->verify_data->list, &c->btree_cache); +#endif + + list_splice(&c->btree_cache_freeable, + &c->btree_cache); + + while (!list_empty(&c->btree_cache)) { + b = list_first_entry(&c->btree_cache, struct btree, list); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + mca_data_free(b); + } + + while (!list_empty(&c->btree_cache_freed)) { + b = list_first_entry(&c->btree_cache_freed, + struct btree, list); + list_del(&b->list); 
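+		/*
+		 * Nodes on btree_cache_freed already had their data buffers
+		 * released by mca_data_free(); all that's left to tear down
+		 * is any pending delayed work and the struct btree itself.
+		 */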
+ cancel_delayed_work_sync(&b->work); + kfree(b); + } + + mutex_unlock(&c->bucket_lock); +} + +int bch_btree_cache_alloc(struct cache_set *c) +{ + unsigned i; + + /* XXX: doesn't check for errors */ + + closure_init_unlocked(&c->gc); + + for (i = 0; i < mca_reserve(c); i++) + mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + list_splice_init(&c->btree_cache, + &c->btree_cache_freeable); + +#ifdef CONFIG_BCACHE_DEBUG + mutex_init(&c->verify_lock); + + c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); + + if (c->verify_data && + c->verify_data->sets[0].data) + list_del_init(&c->verify_data->list); + else + c->verify_data = NULL; +#endif + + c->shrink.shrink = bch_mca_shrink; + c->shrink.seeks = 4; + c->shrink.batch = c->btree_pages * 2; + register_shrinker(&c->shrink); + + return 0; +} + +/* Btree in memory cache - hash table */ + +static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) +{ + return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; +} + +static struct btree *mca_find(struct cache_set *c, struct bkey *k) +{ + struct btree *b; + + rcu_read_lock(); + hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) + if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) + goto out; + b = NULL; +out: + rcu_read_unlock(); + return b; +} + +static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, + int level, struct closure *cl) +{ + int ret = -ENOMEM; + struct btree *i; + + if (!cl) + return ERR_PTR(-ENOMEM); + + /* + * Trying to free up some memory - i.e. reuse some btree nodes - may + * require initiating IO to flush the dirty part of the node. If we're + * running under generic_make_request(), that IO will never finish and + * we would deadlock. Returning -EAGAIN causes the cache lookup code to + * punt to workqueue and retry. + */ + if (current->bio_list) + return ERR_PTR(-EAGAIN); + + if (c->try_harder && c->try_harder != cl) { + closure_wait_event_async(&c->try_wait, cl, !c->try_harder); + return ERR_PTR(-EAGAIN); + } + + /* XXX: tracepoint */ + c->try_harder = cl; + c->try_harder_start = local_clock(); +retry: + list_for_each_entry_reverse(i, &c->btree_cache, list) { + int r = mca_reap(i, cl, btree_order(k)); + if (!r) + return i; + if (r != -ENOMEM) + ret = r; + } + + if (ret == -EAGAIN && + closure_blocking(cl)) { + mutex_unlock(&c->bucket_lock); + closure_sync(cl); + mutex_lock(&c->bucket_lock); + goto retry; + } + + return ERR_PTR(ret); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) +{ + if (c->try_harder == cl) { + time_stats_update(&c->try_harder_time, c->try_harder_start); + c->try_harder = NULL; + __closure_wake_up(&c->try_wait); + } +} + +static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, + int level, struct closure *cl) +{ + struct btree *b; + + lockdep_assert_held(&c->bucket_lock); + + if (mca_find(c, k)) + return NULL; + + /* btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b, &c->btree_cache_freeable, list) + if (!mca_reap(b, NULL, btree_order(k))) + goto out; + + /* We never free struct btree itself, just the memory that holds the on + * disk node. 
Check the freed list before allocating a new one: + */ + list_for_each_entry(b, &c->btree_cache_freed, list) + if (!mca_reap(b, NULL, 0)) { + mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); + if (!b->sets[0].data) + goto err; + else + goto out; + } + + b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); + if (!b) + goto err; + + BUG_ON(!down_write_trylock(&b->lock)); + if (!b->sets->data) + goto err; +out: + BUG_ON(!closure_is_unlocked(&b->io.cl)); + + bkey_copy(&b->key, k); + list_move(&b->list, &c->btree_cache); + hlist_del_init_rcu(&b->hash); + hlist_add_head_rcu(&b->hash, mca_hash(c, k)); + + lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); + b->level = level; + + mca_reinit(b); + + return b; +err: + if (b) + rw_unlock(true, b); + + b = mca_cannibalize(c, k, level, cl); + if (!IS_ERR(b)) + goto out; + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * If IO is necessary, it uses the closure embedded in struct btree_op to wait; + * if that closure is in non blocking mode, will return -EAGAIN. + * + * The btree node will have either a read or a write lock held, depending on + * level and op->lock. + */ +struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, + int level, struct btree_op *op) +{ + int i = 0; + bool write = level <= op->lock; + struct btree *b; + + BUG_ON(level < 0); +retry: + b = mca_find(c, k); + + if (!b) { + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, k, level, &op->cl); + mutex_unlock(&c->bucket_lock); + + if (!b) + goto retry; + if (IS_ERR(b)) + return b; + + bch_btree_read(b); + + if (!write) + downgrade_write(&b->lock); + } else { + rw_lock(write, b, level); + if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { + rw_unlock(write, b); + goto retry; + } + BUG_ON(b->level != level); + } + + b->accessed = 1; + + for (; i <= b->nsets && b->sets[i].size; i++) { + prefetch(b->sets[i].tree); + prefetch(b->sets[i].data); + } + + for (; i <= b->nsets; i++) + prefetch(b->sets[i].data); + + if (!closure_wait_event(&b->io.wait, &op->cl, + btree_node_read_done(b))) { + rw_unlock(write, b); + b = ERR_PTR(-EAGAIN); + } else if (btree_node_io_error(b)) { + rw_unlock(write, b); + b = ERR_PTR(-EIO); + } else + BUG_ON(!b->written); + + return b; +} + +static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) +{ + struct btree *b; + + mutex_lock(&c->bucket_lock); + b = mca_alloc(c, k, level, NULL); + mutex_unlock(&c->bucket_lock); + + if (!IS_ERR_OR_NULL(b)) { + bch_btree_read(b); + rw_unlock(true, b); + } +} + +/* Btree alloc */ + +static void btree_node_free(struct btree *b, struct btree_op *op) +{ + unsigned i; + + /* + * The BUG_ON() in btree_node_get() implies that we must have a write + * lock on parent to free or even invalidate a node + */ + BUG_ON(op->lock <= b->level); + BUG_ON(b == b->c->root); + pr_debug("bucket %s", pbtree(b)); + + if (btree_node_dirty(b)) + btree_complete_write(b, btree_current_write(b)); + clear_bit(BTREE_NODE_dirty, &b->flags); + + if (b->prio_blocked && + !atomic_sub_return(b->prio_blocked, &b->c->prio_blocked)) + closure_wake_up(&b->c->bucket_wait); + + b->prio_blocked = 0; + + cancel_delayed_work(&b->work); + + mutex_lock(&b->c->bucket_lock); + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); + + bch_inc_gen(PTR_CACHE(b->c, &b->key, i), + PTR_BUCKET(b->c, &b->key, i)); + } + + bch_bucket_free(b->c, &b->key); + mca_bucket_free(b); + mutex_unlock(&b->c->bucket_lock); +} + 
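+/*
+ * Note that freeing a node never has to hunt down the keys that point to it:
+ * bumping the bucket gens above is what invalidates those pointers. A
+ * minimal sketch of the arithmetic readers rely on (gen_after() and
+ * ptr_stale() are in bset.h; the gen values are made up for illustration):
+ *
+ *	uint8_t bucket_gen = 3, key_gen = 2;	// key predates the free
+ *	uint8_t r = bucket_gen - key_gen;	// r == 1
+ *
+ * gen_after() returns r unless r > 128 (i.e. the key's gen is "newer"
+ * modulo wraparound, which is clamped to 0), so ptr_stale() now reports
+ * the pointer stale and iteration filters it out via bch_ptr_bad().
+ */
+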
+struct btree *bch_btree_node_alloc(struct cache_set *c, int level,
+				   struct closure *cl)
+{
+	BKEY_PADDED(key) k;
+	struct btree *b = ERR_PTR(-EAGAIN);
+
+	mutex_lock(&c->bucket_lock);
+retry:
+	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl))
+		goto err;
+
+	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
+
+	b = mca_alloc(c, &k.key, level, cl);
+	if (IS_ERR(b))
+		goto err_free;
+
+	if (!b) {
+		cache_bug(c, "Tried to allocate bucket"
+			  " that was in btree cache");
+		__bkey_put(c, &k.key);
+		goto retry;
+	}
+
+	set_btree_node_read_done(b);
+	b->accessed = 1;
+	bch_bset_init_next(b);
+
+	mutex_unlock(&c->bucket_lock);
+	return b;
+err_free:
+	bch_bucket_free(c, &k.key);
+	__bkey_put(c, &k.key);
+err:
+	mutex_unlock(&c->bucket_lock);
+	return b;
+}
+
+static struct btree *btree_node_alloc_replacement(struct btree *b,
+						  struct closure *cl)
+{
+	struct btree *n = bch_btree_node_alloc(b->c, b->level, cl);
+	if (!IS_ERR_OR_NULL(n))
+		bch_btree_sort_into(b, n);
+
+	return n;
+}
+
+/* Garbage collection */
+
+uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
+{
+	uint8_t stale = 0;
+	unsigned i;
+	struct bucket *g;
+
+	/*
+	 * ptr_invalid() can't return true for the keys that mark btree nodes as
+	 * freed, but since ptr_bad() returns true we'll never actually use them
+	 * for anything and thus we don't want to mark their pointers here
+	 */
+	if (!bkey_cmp(k, &ZERO_KEY))
+		return stale;
+
+	for (i = 0; i < KEY_PTRS(k); i++) {
+		if (!ptr_available(c, k, i))
+			continue;
+
+		g = PTR_BUCKET(c, k, i);
+
+		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
+			g->gc_gen = PTR_GEN(k, i);
+
+		if (ptr_stale(c, k, i)) {
+			stale = max(stale, ptr_stale(c, k, i));
+			continue;
+		}
+
+		cache_bug_on(GC_MARK(g) &&
+			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
+			     c, "inconsistent ptrs: mark = %llu, level = %i",
+			     GC_MARK(g), level);
+
+		if (level)
+			SET_GC_MARK(g, GC_MARK_METADATA);
+		else if (KEY_DIRTY(k))
+			SET_GC_MARK(g, GC_MARK_DIRTY);
+
+		/* guard against overflow */
+		SET_GC_SECTORS_USED(g, min_t(unsigned,
+					     GC_SECTORS_USED(g) + KEY_SIZE(k),
+					     (1 << 14) - 1));
+
+		BUG_ON(!GC_SECTORS_USED(g));
+	}
+
+	return stale;
+}
+
+#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
+
+static int btree_gc_mark_node(struct btree *b, unsigned *keys,
+			      struct gc_stat *gc)
+{
+	uint8_t stale = 0;
+	unsigned last_dev = -1;
+	struct bcache_device *d = NULL;
+	struct bkey *k;
+	struct btree_iter iter;
+	struct bset_tree *t;
+
+	gc->nodes++;
+
+	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
+		if (last_dev != KEY_INODE(k)) {
+			last_dev = KEY_INODE(k);
+
+			d = KEY_INODE(k) < b->c->nr_uuids
+				? b->c->devices[last_dev]
+				: NULL;
+		}
+
+		stale = max(stale, btree_mark_key(b, k));
+
+		if (bch_ptr_bad(b, k))
+			continue;
+
+		*keys += bkey_u64s(k);
+
+		gc->key_bytes += bkey_u64s(k);
+		gc->nkeys++;
+
+		gc->data += KEY_SIZE(k);
+		if (KEY_DIRTY(k)) {
+			gc->dirty += KEY_SIZE(k);
+			if (d)
+				d->sectors_dirty_gc += KEY_SIZE(k);
+		}
+	}
+
+	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
+		btree_bug_on(t->size &&
+			     bset_written(b, t) &&
+			     bkey_cmp(&b->key, &t->end) < 0,
+			     b, "found short btree key in gc");
+
+	return stale;
+}
+
+static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k,
+				    struct btree_op *op)
+{
+	/*
+	 * We block priorities from being written for the duration of garbage
+	 * collection, so we can't sleep in btree_alloc() ->
+	 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it
+	 * our closure.
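+	 * If the allocation fails we just keep gc'ing the old node - nothing
+	 * here depends on the copy succeeding, and the next gc pass can retry
+	 * the rewrite.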
+ */ + struct btree *n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR_OR_NULL(n)) { + swap(b, n); + + memcpy(k->ptr, b->key.ptr, + sizeof(uint64_t) * KEY_PTRS(&b->key)); + + __bkey_put(b->c, &b->key); + atomic_inc(&b->c->prio_blocked); + b->prio_blocked++; + + btree_node_free(n, op); + up_write(&n->lock); + } + + return b; +} + +/* + * Leaving this at 2 until we've got incremental garbage collection done; it + * could be higher (and has been tested with 4) except that garbage collection + * could take much longer, adversely affecting latency. + */ +#define GC_MERGE_NODES 2U + +struct gc_merge_info { + struct btree *b; + struct bkey *k; + unsigned keys; +}; + +static void btree_gc_coalesce(struct btree *b, struct btree_op *op, + struct gc_stat *gc, struct gc_merge_info *r) +{ + unsigned nodes = 0, keys = 0, blocks; + int i; + + while (nodes < GC_MERGE_NODES && r[nodes].b) + keys += r[nodes++].keys; + + blocks = btree_default_blocks(b->c) * 2 / 3; + + if (nodes < 2 || + __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) + return; + + for (i = nodes - 1; i >= 0; --i) { + if (r[i].b->written) + r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); + + if (r[i].b->written) + return; + } + + for (i = nodes - 1; i > 0; --i) { + struct bset *n1 = r[i].b->sets->data; + struct bset *n2 = r[i - 1].b->sets->data; + struct bkey *k, *last = NULL; + + keys = 0; + + if (i == 1) { + /* + * Last node we're not getting rid of - we're getting + * rid of the node at r[0]. Have to try and fit all of + * the remaining keys into this node; we can't ensure + * they will always fit due to rounding and variable + * length keys (shouldn't be possible in practice, + * though) + */ + if (__set_blocks(n1, n1->keys + r->keys, + b->c) > btree_blocks(r[i].b)) + return; + + keys = n2->keys; + last = &r->b->key; + } else + for (k = n2->start; + k < end(n2); + k = bkey_next(k)) { + if (__set_blocks(n1, n1->keys + keys + + bkey_u64s(k), b->c) > blocks) + break; + + last = k; + keys += bkey_u64s(k); + } + + BUG_ON(__set_blocks(n1, n1->keys + keys, + b->c) > btree_blocks(r[i].b)); + + if (last) { + bkey_copy_key(&r[i].b->key, last); + bkey_copy_key(r[i].k, last); + } + + memcpy(end(n1), + n2->start, + (void *) node(n2, keys) - (void *) n2->start); + + n1->keys += keys; + + memmove(n2->start, + node(n2, keys), + (void *) end(n2) - (void *) node(n2, keys)); + + n2->keys -= keys; + + r[i].keys = n1->keys; + r[i - 1].keys = n2->keys; + } + + btree_node_free(r->b, op); + up_write(&r->b->lock); + + pr_debug("coalesced %u nodes", nodes); + + gc->nodes--; + nodes--; + + memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); + memset(&r[nodes], 0, sizeof(struct gc_merge_info)); +} + +static int btree_gc_recurse(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + void write(struct btree *r) + { + if (!r->written) + bch_btree_write(r, true, op); + else if (btree_node_dirty(r)) { + BUG_ON(btree_current_write(r)->owner); + btree_current_write(r)->owner = writes; + closure_get(writes); + + bch_btree_write(r, true, NULL); + } + + up_write(&r->lock); + } + + int ret = 0, stale; + unsigned i; + struct gc_merge_info r[GC_MERGE_NODES]; + + memset(r, 0, sizeof(r)); + + while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { + r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); + + if (IS_ERR(r->b)) { + ret = PTR_ERR(r->b); + break; + } + + r->keys = 0; + stale = btree_gc_mark_node(r->b, &r->keys, gc); + + if (!b->written && + (r->b->level || stale > 10 || + b->c->gc_always_rewrite)) 
+ r->b = btree_gc_alloc(r->b, r->k, op); + + if (r->b->level) + ret = btree_gc_recurse(r->b, op, writes, gc); + + if (ret) { + write(r->b); + break; + } + + bkey_copy_key(&b->c->gc_done, r->k); + + if (!b->written) + btree_gc_coalesce(b, op, gc, r); + + if (r[GC_MERGE_NODES - 1].b) + write(r[GC_MERGE_NODES - 1].b); + + memmove(&r[1], &r[0], + sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); + + /* When we've got incremental GC working, we'll want to do + * if (should_resched()) + * return -EAGAIN; + */ + cond_resched(); +#if 0 + if (need_resched()) { + ret = -EAGAIN; + break; + } +#endif + } + + for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) + write(r[i].b); + + /* Might have freed some children, must remove their keys */ + if (!b->written) + bch_btree_sort(b); + + return ret; +} + +static int bch_btree_gc_root(struct btree *b, struct btree_op *op, + struct closure *writes, struct gc_stat *gc) +{ + struct btree *n = NULL; + unsigned keys = 0; + int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); + + if (b->level || stale > 10) + n = btree_node_alloc_replacement(b, NULL); + + if (!IS_ERR_OR_NULL(n)) + swap(b, n); + + if (b->level) + ret = btree_gc_recurse(b, op, writes, gc); + + if (!b->written || btree_node_dirty(b)) { + atomic_inc(&b->c->prio_blocked); + b->prio_blocked++; + bch_btree_write(b, true, n ? op : NULL); + } + + if (!IS_ERR_OR_NULL(n)) { + closure_sync(&op->cl); + bch_btree_set_root(b); + btree_node_free(n, op); + rw_unlock(true, b); + } + + return ret; +} + +static void btree_gc_start(struct cache_set *c) +{ + struct cache *ca; + struct bucket *b; + struct bcache_device **d; + unsigned i; + + if (!c->gc_mark_valid) + return; + + mutex_lock(&c->bucket_lock); + + c->gc_mark_valid = 0; + c->gc_done = ZERO_KEY; + + for_each_cache(ca, c, i) + for_each_bucket(b, ca) { + b->gc_gen = b->gen; + if (!atomic_read(&b->pin)) + SET_GC_MARK(b, GC_MARK_RECLAIMABLE); + } + + for (d = c->devices; + d < c->devices + c->nr_uuids; + d++) + if (*d) + (*d)->sectors_dirty_gc = 0; + + mutex_unlock(&c->bucket_lock); +} + +size_t bch_btree_gc_finish(struct cache_set *c) +{ + size_t available = 0; + struct bucket *b; + struct cache *ca; + struct bcache_device **d; + unsigned i; + + mutex_lock(&c->bucket_lock); + + set_gc_sectors(c); + c->gc_mark_valid = 1; + c->need_gc = 0; + + if (c->root) + for (i = 0; i < KEY_PTRS(&c->root->key); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), + GC_MARK_METADATA); + + for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) + SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), + GC_MARK_METADATA); + + for_each_cache(ca, c, i) { + uint64_t *i; + + ca->invalidate_needs_gc = 0; + + for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) + SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + + for (i = ca->prio_buckets; + i < ca->prio_buckets + prio_buckets(ca) * 2; i++) + SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); + + for_each_bucket(b, ca) { + b->last_gc = b->gc_gen; + c->need_gc = max(c->need_gc, bucket_gc_gen(b)); + + if (!atomic_read(&b->pin) && + GC_MARK(b) == GC_MARK_RECLAIMABLE) { + available++; + if (!GC_SECTORS_USED(b)) + bch_bucket_add_unused(ca, b); + } + } + } + + for (d = c->devices; + d < c->devices + c->nr_uuids; + d++) + if (*d) { + unsigned long last = + atomic_long_read(&((*d)->sectors_dirty)); + long difference = (*d)->sectors_dirty_gc - last; + + pr_debug("sectors dirty off by %li", difference); + + (*d)->sectors_dirty_last += difference; + + atomic_long_set(&((*d)->sectors_dirty), + (*d)->sectors_dirty_gc); + } + + mutex_unlock(&c->bucket_lock); + 
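+	/*
+	 * 'available' - the buckets gc just proved reclaimable - is handed
+	 * back to bch_btree_gc(), which uses it to compute gc_stats.in_use
+	 * as a percentage of nbuckets.
+	 */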
return available; +} + +static void bch_btree_gc(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, gc.cl); + int ret; + unsigned long available; + struct gc_stat stats; + struct closure writes; + struct btree_op op; + + uint64_t start_time = local_clock(); + trace_bcache_gc_start(c->sb.set_uuid); + blktrace_msg_all(c, "Starting gc"); + + memset(&stats, 0, sizeof(struct gc_stat)); + closure_init_stack(&writes); + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; + + btree_gc_start(c); + + ret = btree_root(gc_root, c, &op, &writes, &stats); + closure_sync(&op.cl); + closure_sync(&writes); + + if (ret) { + blktrace_msg_all(c, "Stopped gc"); + pr_warn("gc failed!"); + + continue_at(cl, bch_btree_gc, bch_gc_wq); + } + + /* Possibly wait for new UUIDs or whatever to hit disk */ + bch_journal_meta(c, &op.cl); + closure_sync(&op.cl); + + available = bch_btree_gc_finish(c); + + time_stats_update(&c->btree_gc_time, start_time); + + stats.key_bytes *= sizeof(uint64_t); + stats.dirty <<= 9; + stats.data <<= 9; + stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; + memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); + blktrace_msg_all(c, "Finished gc"); + + trace_bcache_gc_end(c->sb.set_uuid); + wake_up(&c->alloc_wait); + closure_wake_up(&c->bucket_wait); + + continue_at(cl, bch_moving_gc, bch_gc_wq); +} + +void bch_queue_gc(struct cache_set *c) +{ + closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); +} + +/* Initial partial gc */ + +static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, + unsigned long **seen) +{ + int ret; + unsigned i; + struct bkey *k; + struct bucket *g; + struct btree_iter iter; + + for_each_key_filter(b, k, &iter, bch_ptr_invalid) { + for (i = 0; i < KEY_PTRS(k); i++) { + if (!ptr_available(b->c, k, i)) + continue; + + g = PTR_BUCKET(b->c, k, i); + + if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), + seen[PTR_DEV(k, i)]) || + !ptr_stale(b->c, k, i)) { + g->gen = PTR_GEN(k, i); + + if (b->level) + g->prio = BTREE_PRIO; + else if (g->prio == BTREE_PRIO) + g->prio = INITIAL_PRIO; + } + } + + btree_mark_key(b, k); + } + + if (b->level) { + k = bch_next_recurse_key(b, &ZERO_KEY); + + while (k) { + struct bkey *p = bch_next_recurse_key(b, k); + if (p) + btree_node_prefetch(b->c, p, b->level - 1); + + ret = btree(check_recurse, k, b, op, seen); + if (ret) + return ret; + + k = p; + } + } + + return 0; +} + +int bch_btree_check(struct cache_set *c, struct btree_op *op) +{ + int ret = -ENOMEM; + unsigned i; + unsigned long *seen[MAX_CACHES_PER_SET]; + + memset(seen, 0, sizeof(seen)); + + for (i = 0; c->cache[i]; i++) { + size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); + seen[i] = kmalloc(n, GFP_KERNEL); + if (!seen[i]) + goto err; + + /* Disables the seen array until prio_read() uses it too */ + memset(seen[i], 0xFF, n); + } + + ret = btree_root(check_recurse, c, op, seen); +err: + for (i = 0; i < MAX_CACHES_PER_SET; i++) + kfree(seen[i]); + return ret; +} + +/* Btree insertion */ + +static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) +{ + struct bset *i = b->sets[b->nsets].data; + + memmove((uint64_t *) where + bkey_u64s(insert), + where, + (void *) end(i) - (void *) where); + + i->keys += bkey_u64s(insert); + bkey_copy(where, insert); + bch_bset_fix_lookup_table(b, where); +} + +static bool fix_overlapping_extents(struct btree *b, + struct bkey *insert, + struct btree_iter *iter, + struct btree_op *op) +{ + void subtract_dirty(struct bkey *k, int sectors) + { + struct 
bcache_device *d = b->c->devices[KEY_INODE(k)]; + + if (KEY_DIRTY(k) && d) + atomic_long_sub(sectors, &d->sectors_dirty); + } + + unsigned old_size, sectors_found = 0; + + while (1) { + struct bkey *k = bch_btree_iter_next(iter); + if (!k || + bkey_cmp(&START_KEY(k), insert) >= 0) + break; + + if (bkey_cmp(k, &START_KEY(insert)) <= 0) + continue; + + old_size = KEY_SIZE(k); + + /* + * We might overlap with 0 size extents; we can't skip these + * because if they're in the set we're inserting to we have to + * adjust them so they don't overlap with the key we're + * inserting. But we don't want to check them for BTREE_REPLACE + * operations. + */ + + if (op->type == BTREE_REPLACE && + KEY_SIZE(k)) { + /* + * k might have been split since we inserted/found the + * key we're replacing + */ + unsigned i; + uint64_t offset = KEY_START(k) - + KEY_START(&op->replace); + + /* But it must be a subset of the replace key */ + if (KEY_START(k) < KEY_START(&op->replace) || + KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) + goto check_failed; + + /* We didn't find a key that we were supposed to */ + if (KEY_START(k) > KEY_START(insert) + sectors_found) + goto check_failed; + + if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) + goto check_failed; + + /* skip past gen */ + offset <<= 8; + + BUG_ON(!KEY_PTRS(&op->replace)); + + for (i = 0; i < KEY_PTRS(&op->replace); i++) + if (k->ptr[i] != op->replace.ptr[i] + offset) + goto check_failed; + + sectors_found = KEY_OFFSET(k) - KEY_START(insert); + } + + if (bkey_cmp(insert, k) < 0 && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { + /* + * We overlapped in the middle of an existing key: that + * means we have to split the old key. But we have to do + * slightly different things depending on whether the + * old key has been written out yet. + */ + + struct bkey *top; + + subtract_dirty(k, KEY_SIZE(insert)); + + if (bkey_written(b, k)) { + /* + * We insert a new key to cover the top of the + * old key, and the old key is modified in place + * to represent the bottom split. + * + * It's completely arbitrary whether the new key + * is the top or the bottom, but it has to match + * up with what btree_sort_fixup() does - it + * doesn't check for this kind of overlap, it + * depends on us inserting a new key for the top + * here. 
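+			 *
+			 * A sketch of this case (illustrative only, not part
+			 * of the original comment); "top" is the new key for
+			 * the upper half, and insert itself is added by the
+			 * caller afterwards:
+			 *
+			 *	before:  |-----------k-----------|
+			 *	insert:        |---insert---|
+			 *	after:   |--k--|            |-top-|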
+ */ + top = bch_bset_search(b, &b->sets[b->nsets], + insert); + shift_keys(b, top, k); + } else { + BKEY_PADDED(key) temp; + bkey_copy(&temp.key, k); + shift_keys(b, k, &temp.key); + top = bkey_next(k); + } + + bch_cut_front(insert, top); + bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + return false; + } + + if (bkey_cmp(insert, k) < 0) { + bch_cut_front(insert, k); + } else { + if (bkey_written(b, k) && + bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { + /* + * Completely overwrote, so we don't have to + * invalidate the binary search tree + */ + bch_cut_front(k, k); + } else { + __bch_cut_back(&START_KEY(insert), k); + bch_bset_fix_invalidated_key(b, k); + } + } + + subtract_dirty(k, old_size - KEY_SIZE(k)); + } + +check_failed: + if (op->type == BTREE_REPLACE) { + if (!sectors_found) { + op->insert_collision = true; + return true; + } else if (sectors_found < KEY_SIZE(insert)) { + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found)); + SET_KEY_SIZE(insert, sectors_found); + } + } + + return false; +} + +static bool btree_insert_key(struct btree *b, struct btree_op *op, + struct bkey *k) +{ + struct bset *i = b->sets[b->nsets].data; + struct bkey *m, *prev; + const char *status = "insert"; + + BUG_ON(bkey_cmp(k, &b->key) > 0); + BUG_ON(b->level && !KEY_PTRS(k)); + BUG_ON(!b->level && !KEY_OFFSET(k)); + + if (!b->level) { + struct btree_iter iter; + struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); + + /* + * bset_search() returns the first key that is strictly greater + * than the search key - but for back merging, we want to find + * the first key that is greater than or equal to KEY_START(k) - + * unless KEY_START(k) is 0. + */ + if (KEY_OFFSET(&search)) + SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); + + prev = NULL; + m = bch_btree_iter_init(b, &iter, &search); + + if (fix_overlapping_extents(b, k, &iter, op)) + return false; + + while (m != end(i) && + bkey_cmp(k, &START_KEY(m)) > 0) + prev = m, m = bkey_next(m); + + if (key_merging_disabled(b->c)) + goto insert; + + /* prev is in the tree, if we merge we're done */ + status = "back merging"; + if (prev && + bch_bkey_try_merge(b, prev, k)) + goto merged; + + status = "overwrote front"; + if (m != end(i) && + KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) + goto copy; + + status = "front merge"; + if (m != end(i) && + bch_bkey_try_merge(b, k, m)) + goto copy; + } else + m = bch_bset_search(b, &b->sets[b->nsets], k); + +insert: shift_keys(b, m, k); +copy: bkey_copy(m, k); +merged: + bch_check_keys(b, "%s for %s at %s: %s", status, + op_type(op), pbtree(b), pkey(k)); + bch_check_key_order_msg(b, i, "%s for %s at %s: %s", status, + op_type(op), pbtree(b), pkey(k)); + + if (b->level && !KEY_OFFSET(k)) + b->prio_blocked++; + + pr_debug("%s for %s at %s: %s", status, + op_type(op), pbtree(b), pkey(k)); + + return true; +} + +bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) +{ + bool ret = false; + struct bkey *k; + unsigned oldsize = bch_count_data(b); + + while ((k = bch_keylist_pop(&op->keys))) { + bkey_put(b->c, k, b->level); + ret |= btree_insert_key(b, op, k); + } + + BUG_ON(bch_count_data(b) < oldsize); + return ret; +} + +bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, + struct bio *bio) +{ + bool ret = false; + uint64_t btree_ptr = b->key.ptr[0]; + unsigned long seq = b->seq; + BKEY_PADDED(k) tmp; + + rw_unlock(false, b); + rw_lock(true, b, b->level); + + if (b->key.ptr[0] != btree_ptr || + b->seq != seq + 1 || + 
should_split(b)) + goto out; + + op->replace = KEY(op->inode, bio_end(bio), bio_sectors(bio)); + + SET_KEY_PTRS(&op->replace, 1); + get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); + + SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); + + bkey_copy(&tmp.k, &op->replace); + + BUG_ON(op->type != BTREE_INSERT); + BUG_ON(!btree_insert_key(b, op, &tmp.k)); + bch_btree_write(b, false, NULL); + ret = true; +out: + downgrade_write(&b->lock); + return ret; +} + +static int btree_split(struct btree *b, struct btree_op *op) +{ + bool split, root = b == b->c->root; + struct btree *n1, *n2 = NULL, *n3 = NULL; + uint64_t start_time = local_clock(); + + if (b->level) + set_closure_blocking(&op->cl); + + n1 = btree_node_alloc_replacement(b, &op->cl); + if (IS_ERR(n1)) + goto err; + + split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; + + pr_debug("%ssplitting at %s keys %i", split ? "" : "not ", + pbtree(b), n1->sets[0].data->keys); + + if (split) { + unsigned keys = 0; + + n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); + if (IS_ERR(n2)) + goto err_free1; + + if (root) { + n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); + if (IS_ERR(n3)) + goto err_free2; + } + + bch_btree_insert_keys(n1, op); + + /* Has to be a linear search because we don't have an auxiliary + * search tree yet + */ + + while (keys < (n1->sets[0].data->keys * 3) / 5) + keys += bkey_u64s(node(n1->sets[0].data, keys)); + + bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); + keys += bkey_u64s(node(n1->sets[0].data, keys)); + + n2->sets[0].data->keys = n1->sets[0].data->keys - keys; + n1->sets[0].data->keys = keys; + + memcpy(n2->sets[0].data->start, + end(n1->sets[0].data), + n2->sets[0].data->keys * sizeof(uint64_t)); + + bkey_copy_key(&n2->key, &b->key); + + bch_keylist_add(&op->keys, &n2->key); + bch_btree_write(n2, true, op); + rw_unlock(true, n2); + } else + bch_btree_insert_keys(n1, op); + + bch_keylist_add(&op->keys, &n1->key); + bch_btree_write(n1, true, op); + + if (n3) { + bkey_copy_key(&n3->key, &MAX_KEY); + bch_btree_insert_keys(n3, op); + bch_btree_write(n3, true, op); + + closure_sync(&op->cl); + bch_btree_set_root(n3); + rw_unlock(true, n3); + } else if (root) { + op->keys.top = op->keys.bottom; + closure_sync(&op->cl); + bch_btree_set_root(n1); + } else { + unsigned i; + + bkey_copy(op->keys.top, &b->key); + bkey_copy_key(op->keys.top, &ZERO_KEY); + + for (i = 0; i < KEY_PTRS(&b->key); i++) { + uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; + + SET_PTR_GEN(op->keys.top, i, g); + } + + bch_keylist_push(&op->keys); + closure_sync(&op->cl); + atomic_inc(&b->c->prio_blocked); + } + + rw_unlock(true, n1); + btree_node_free(b, op); + + time_stats_update(&b->c->btree_split_time, start_time); + + return 0; +err_free2: + __bkey_put(n2->c, &n2->key); + btree_node_free(n2, op); + rw_unlock(true, n2); +err_free1: + __bkey_put(n1->c, &n1->key); + btree_node_free(n1, op); + rw_unlock(true, n1); +err: + if (n3 == ERR_PTR(-EAGAIN) || + n2 == ERR_PTR(-EAGAIN) || + n1 == ERR_PTR(-EAGAIN)) + return -EAGAIN; + + pr_warn("couldn't split"); + return -ENOMEM; +} + +static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, + struct keylist *stack_keys) +{ + if (b->level) { + int ret; + struct bkey *insert = op->keys.bottom; + struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); + + if (!k) { + btree_bug(b, "no key to recurse on at level %i/%i", + b->level, b->c->root->level); + + op->keys.top = op->keys.bottom; + return -EIO; + } + + if (bkey_cmp(insert, k) > 0) { + unsigned 
i; + + if (op->type == BTREE_REPLACE) { + __bkey_put(b->c, insert); + op->keys.top = op->keys.bottom; + op->insert_collision = true; + return 0; + } + + for (i = 0; i < KEY_PTRS(insert); i++) + atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); + + bkey_copy(stack_keys->top, insert); + + bch_cut_back(k, insert); + bch_cut_front(k, stack_keys->top); + + bch_keylist_push(stack_keys); + } + + ret = btree(insert_recurse, k, b, op, stack_keys); + if (ret) + return ret; + } + + if (!bch_keylist_empty(&op->keys)) { + if (should_split(b)) { + if (op->lock <= b->c->root->level) { + BUG_ON(b->level); + op->lock = b->c->root->level + 1; + return -EINTR; + } + return btree_split(b, op); + } + + BUG_ON(write_block(b) != b->sets[b->nsets].data); + + if (bch_btree_insert_keys(b, op)) + bch_btree_write(b, false, op); + } + + return 0; +} + +int bch_btree_insert(struct btree_op *op, struct cache_set *c) +{ + int ret = 0; + struct keylist stack_keys; + + /* + * Don't want to block with the btree locked unless we have to, + * otherwise we get deadlocks with try_harder and between split/gc + */ + clear_closure_blocking(&op->cl); + + BUG_ON(bch_keylist_empty(&op->keys)); + bch_keylist_copy(&stack_keys, &op->keys); + bch_keylist_init(&op->keys); + + while (!bch_keylist_empty(&stack_keys) || + !bch_keylist_empty(&op->keys)) { + if (bch_keylist_empty(&op->keys)) { + bch_keylist_add(&op->keys, + bch_keylist_pop(&stack_keys)); + op->lock = 0; + } + + ret = btree_root(insert_recurse, c, op, &stack_keys); + + if (ret == -EAGAIN) { + ret = 0; + closure_sync(&op->cl); + } else if (ret) { + struct bkey *k; + + pr_err("error %i trying to insert key for %s", + ret, op_type(op)); + + while ((k = bch_keylist_pop(&stack_keys) ?: + bch_keylist_pop(&op->keys))) + bkey_put(c, k, 0); + } + } + + bch_keylist_free(&stack_keys); + + if (op->journal) + atomic_dec_bug(op->journal); + op->journal = NULL; + return ret; +} + +void bch_btree_set_root(struct btree *b) +{ + unsigned i; + + BUG_ON(!b->written); + + for (i = 0; i < KEY_PTRS(&b->key); i++) + BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); + + mutex_lock(&b->c->bucket_lock); + list_del_init(&b->list); + mutex_unlock(&b->c->bucket_lock); + + b->c->root = b; + __bkey_put(b->c, &b->key); + + bch_journal_meta(b->c, NULL); + pr_debug("%s for %pf", pbtree(b), __builtin_return_address(0)); +} + +/* Cache lookup */ + +static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, + struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + int ret = 0; + + while (!ret && + !op->lookup_done) { + unsigned sectors = INT_MAX; + + if (KEY_INODE(k) == op->inode) { + if (KEY_START(k) <= bio->bi_sector) + break; + + sectors = min_t(uint64_t, sectors, + KEY_START(k) - bio->bi_sector); + } + + ret = s->d->cache_miss(b, s, bio, sectors); + } + + return ret; +} + +/* + * Read from a single key, handling the initial cache miss if the key starts in + * the middle of the bio + */ +static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, + struct bkey *k) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + unsigned ptr; + struct bio *n; + + int ret = submit_partial_cache_miss(b, op, k); + if (ret || op->lookup_done) + return ret; + + /* XXX: figure out best pointer - for multiple cache devices */ + ptr = 0; + + PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; + + while (!op->lookup_done && + KEY_INODE(k) == op->inode && + bio->bi_sector < KEY_OFFSET(k)) { + struct bkey 
*bio_key; + sector_t sector = PTR_OFFSET(k, ptr) + + (bio->bi_sector - KEY_START(k)); + unsigned sectors = min_t(uint64_t, INT_MAX, + KEY_OFFSET(k) - bio->bi_sector); + + n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (!n) + return -EAGAIN; + + if (n == bio) + op->lookup_done = true; + + bio_key = &container_of(n, struct bbio, bio)->key; + + /* + * The bucket we're reading from might be reused while our bio + * is in flight, and we could then end up reading the wrong + * data. + * + * We guard against this by checking (in cache_read_endio()) if + * the pointer is stale again; if so, we treat it as an error + * and reread from the backing device (but we don't pass that + * error up anywhere). + */ + + bch_bkey_copy_single_ptr(bio_key, k, ptr); + SET_PTR_OFFSET(bio_key, 0, sector); + + n->bi_end_io = bch_cache_read_endio; + n->bi_private = &s->cl; + + trace_bcache_cache_hit(n); + __bch_submit_bbio(n, b->c); + } + + return 0; +} + +int bch_btree_search_recurse(struct btree *b, struct btree_op *op) +{ + struct search *s = container_of(op, struct search, op); + struct bio *bio = &s->bio.bio; + + int ret = 0; + struct bkey *k; + struct btree_iter iter; + bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0)); + + pr_debug("at %s searching for %u:%llu", pbtree(b), op->inode, + (uint64_t) bio->bi_sector); + + do { + k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad); + if (!k) { + /* + * b->key would be exactly what we want, except that + * pointers to btree nodes have nonzero size - we + * wouldn't go far enough + */ + + ret = submit_partial_cache_miss(b, op, + &KEY(KEY_INODE(&b->key), + KEY_OFFSET(&b->key), 0)); + break; + } + + ret = b->level + ? btree(search_recurse, k, b, op) + : submit_partial_cache_hit(b, op, k); + } while (!ret && + !op->lookup_done); + + return ret; +} + +/* Keybuf code */ + +static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) +{ + /* Overlapping keys compare equal */ + if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) + return -1; + if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) + return 1; + return 0; +} + +static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, + struct keybuf_key *r) +{ + return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); +} + +static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op, + struct keybuf *buf, struct bkey *end) +{ + struct btree_iter iter; + bch_btree_iter_init(b, &iter, &buf->last_scanned); + + while (!array_freelist_empty(&buf->freelist)) { + struct bkey *k = bch_btree_iter_next_filter(&iter, b, + bch_ptr_bad); + + if (!b->level) { + if (!k) { + buf->last_scanned = b->key; + break; + } + + buf->last_scanned = *k; + if (bkey_cmp(&buf->last_scanned, end) >= 0) + break; + + if (buf->key_predicate(buf, k)) { + struct keybuf_key *w; + + pr_debug("%s", pkey(k)); + + spin_lock(&buf->lock); + + w = array_alloc(&buf->freelist); + + w->private = NULL; + bkey_copy(&w->key, k); + + if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) + array_free(&buf->freelist, w); + + spin_unlock(&buf->lock); + } + } else { + if (!k) + break; + + btree(refill_keybuf, k, b, op, buf, end); + /* + * Might get an error here, but can't really do anything + * and it'll get logged elsewhere. Just read what we + * can. 
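+			 * (When the keybuf runs dry, bch_keybuf_next_rescan()
+			 * calls bch_refill_keybuf() to scan further.)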
+ */ + + if (bkey_cmp(&buf->last_scanned, end) >= 0) + break; + + cond_resched(); + } + } + + return 0; +} + +void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, + struct bkey *end) +{ + struct bkey start = buf->last_scanned; + struct btree_op op; + bch_btree_op_init_stack(&op); + + cond_resched(); + + btree_root(refill_keybuf, c, &op, buf, end); + closure_sync(&op.cl); + + pr_debug("found %s keys from %llu:%llu to %llu:%llu", + RB_EMPTY_ROOT(&buf->keys) ? "no" : + array_freelist_empty(&buf->freelist) ? "some" : "a few", + KEY_INODE(&start), KEY_OFFSET(&start), + KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); + + spin_lock(&buf->lock); + + if (!RB_EMPTY_ROOT(&buf->keys)) { + struct keybuf_key *w; + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + buf->start = START_KEY(&w->key); + + w = RB_LAST(&buf->keys, struct keybuf_key, node); + buf->end = w->key; + } else { + buf->start = MAX_KEY; + buf->end = MAX_KEY; + } + + spin_unlock(&buf->lock); +} + +static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + rb_erase(&w->node, &buf->keys); + array_free(&buf->freelist, w); +} + +void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) +{ + spin_lock(&buf->lock); + __bch_keybuf_del(buf, w); + spin_unlock(&buf->lock); +} + +bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, + struct bkey *end) +{ + bool ret = false; + struct keybuf_key *p, *w, s; + s.key = *start; + + if (bkey_cmp(end, &buf->start) <= 0 || + bkey_cmp(start, &buf->end) >= 0) + return false; + + spin_lock(&buf->lock); + w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); + + while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { + p = w; + w = RB_NEXT(w, node); + + if (p->private) + ret = true; + else + __bch_keybuf_del(buf, p); + } + + spin_unlock(&buf->lock); + return ret; +} + +struct keybuf_key *bch_keybuf_next(struct keybuf *buf) +{ + struct keybuf_key *w; + spin_lock(&buf->lock); + + w = RB_FIRST(&buf->keys, struct keybuf_key, node); + + while (w && w->private) + w = RB_NEXT(w, node); + + if (w) + w->private = ERR_PTR(-EINTR); + + spin_unlock(&buf->lock); + return w; +} + +struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, + struct keybuf *buf, + struct bkey *end) +{ + struct keybuf_key *ret; + + while (1) { + ret = bch_keybuf_next(buf); + if (ret) + break; + + if (bkey_cmp(&buf->last_scanned, end) >= 0) { + pr_debug("scan finished"); + break; + } + + bch_refill_keybuf(c, buf, end); + } + + return ret; +} + +void bch_keybuf_init(struct keybuf *buf, keybuf_pred_fn *fn) +{ + buf->key_predicate = fn; + buf->last_scanned = MAX_KEY; + buf->keys = RB_ROOT; + + spin_lock_init(&buf->lock); + array_allocator_init(&buf->freelist); +} + +void bch_btree_exit(void) +{ + if (btree_io_wq) + destroy_workqueue(btree_io_wq); + if (bch_gc_wq) + destroy_workqueue(bch_gc_wq); +} + +int __init bch_btree_init(void) +{ + if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || + !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/btree.h b/drivers/md/bcache/btree.h new file mode 100644 index 000000000000..af4a7092a28c --- /dev/null +++ b/drivers/md/bcache/btree.h @@ -0,0 +1,405 @@ +#ifndef _BCACHE_BTREE_H +#define _BCACHE_BTREE_H + +/* + * THE BTREE: + * + * At a high level, bcache's btree is relatively standard b+ tree. All keys and + * pointers are in the leaves; interior nodes only have pointers to the child + * nodes. 
+ *
+ * In the interior nodes, a struct bkey always points to a child btree node, and
+ * the key is the highest key in the child node - except that the highest key in
+ * an interior node is always MAX_KEY. The size field refers to the size on disk
+ * of the child node - this would allow us to have variable sized btree nodes
+ * (handy for keeping the depth of the btree 1 by expanding just the root).
+ *
+ * Btree nodes are themselves log structured, but this is hidden fairly
+ * thoroughly. Btree nodes on disk will in practice have extents that overlap
+ * (because they were written at different times), but in memory we never have
+ * overlapping extents - when we read in a btree node from disk, the first thing
+ * we do is resort all the sets of keys with a mergesort, and in the same pass
+ * we check for overlapping extents and adjust them appropriately.
+ *
+ * struct btree_op is a central interface to the btree code. It's used for
+ * specifying read vs. write locking, and the embedded closure is used for
+ * waiting on IO or reserve memory.
+ *
+ * BTREE CACHE:
+ *
+ * Btree nodes are cached in memory; traversing the btree might require reading
+ * in btree nodes which is handled mostly transparently.
+ *
+ * bch_btree_node_get() looks up a btree node in the cache and reads it in from
+ * disk if necessary. This function is almost never called directly though - the
+ * btree() macro is used to get a btree node, call some function on it, and
+ * unlock the node after the function returns.
+ *
+ * The root is special cased - it's taken out of the cache's lru (thus pinning
+ * it in memory), so we can find the root of the btree by just dereferencing a
+ * pointer instead of looking it up in the cache. This makes locking a bit
+ * tricky, since the root pointer is protected by the lock in the btree node it
+ * points to - the btree_root() macro handles this.
+ *
+ * In various places we must be able to allocate memory for multiple btree nodes
+ * in order to make forward progress. To do this we use the btree cache itself
+ * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
+ * cache we can reuse. We can't allow more than one thread to be doing this at a
+ * time, so there's a lock, implemented by a pointer to the btree_op closure -
+ * this allows the btree_root() macro to implicitly release this lock.
+ *
+ * BTREE IO:
+ *
+ * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
+ * this.
+ *
+ * For writing, we have two btree_write structs embedded in struct btree - one
+ * write in flight, and one being set up, and we toggle between them.
+ *
+ * Writing is done with a single function - bch_btree_write() really serves two
+ * different purposes and should be broken up into two different functions. When
+ * passing now = false, it merely indicates that the node is now dirty - calling
+ * it ensures that the dirty keys will be written at some point in the future.
+ *
+ * When passing now = true, bch_btree_write() causes a write to happen
+ * "immediately" (if there was already a write in flight, it'll cause the write
+ * to happen as soon as the previous write completes). It returns immediately
+ * though - but it takes a refcount on the closure in struct btree_op you passed
+ * to it, so a closure_sync() later can be used to wait for the write to
+ * complete.
+ *
+ * This is handy because btree_split() and garbage collection can issue writes
+ * in parallel, reducing the amount of time they have to hold write locks.
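+ *
+ * An illustrative sketch of the two modes (not part of the original comment;
+ * assumes a dirty node b and a struct btree_op op whose closure we can wait
+ * on):
+ *
+ *	bch_btree_write(b, false, NULL);	mark dirty; written later
+ *
+ *	bch_btree_write(b, true, op);		start the write now
+ *	closure_sync(&op->cl);			wait for it to complete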
+ *
+ * LOCKING:
+ *
+ * When traversing the btree, we may need write locks starting at some level -
+ * inserting a key into the btree will typically only require a write lock on
+ * the leaf node.
+ *
+ * This is specified with the lock field in struct btree_op; lock = 0 means we
+ * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
+ * checks this field and returns the node with the appropriate lock held.
+ *
+ * If, after traversing the btree, the insertion code discovers it has to split
+ * then it must restart from the root and take new locks - to do this it changes
+ * the lock field and returns -EINTR, which causes the btree_root() macro to
+ * loop.
+ *
+ * Handling cache misses requires a different mechanism for upgrading to a write
+ * lock. We do cache lookups with only a read lock held, but if we get a cache
+ * miss and we wish to insert this data into the cache, we have to insert a
+ * placeholder key to detect races - otherwise, we could race with a write and
+ * overwrite the data that was just written to the cache with stale data from
+ * the backing device.
+ *
+ * For this we use a sequence number that write locks and unlocks increment - to
+ * insert the check key it unlocks the btree node and then takes a write lock,
+ * and fails if the sequence number doesn't match.
+ */
+
+#include "bset.h"
+#include "debug.h"
+
+struct btree_write {
+	struct closure *owner;
+	atomic_t *journal;
+
+	/* If btree_split() frees a btree node, it writes a new pointer to that
+	 * btree node indicating it was freed; it takes a refcount on
+	 * c->prio_blocked because we can't write the gens until the new
+	 * pointer is on disk. This allows btree_write_endio() to release the
+	 * refcount that btree_split() took.
+	 */
+	int prio_blocked;
+};
+
+struct btree {
+	/* Hottest entries first */
+	struct hlist_node hash;
+
+	/* Key/pointer for this btree node */
+	BKEY_PADDED(key);
+
+	/* Single bit - set when accessed, cleared by shrinker */
+	unsigned long accessed;
+	unsigned long seq;
+	struct rw_semaphore lock;
+	struct cache_set *c;
+
+	unsigned long flags;
+	uint16_t written;	/* would be nice to kill */
+	uint8_t level;
+	uint8_t nsets;
+	uint8_t page_order;
+
+	/*
+	 * Set of sorted keys - the real btree node - plus a binary search tree
+	 *
+	 * sets[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
+	 * to the memory we have allocated for this btree node. Additionally,
+	 * set[0]->data points to the entire btree node as it exists on disk.
+ */ + struct bset_tree sets[MAX_BSETS]; + + /* Used to refcount bio splits, also protects b->bio */ + struct closure_with_waitlist io; + + /* Gets transferred to w->prio_blocked - see the comment there */ + int prio_blocked; + + struct list_head list; + struct delayed_work work; + + uint64_t io_start_time; + struct btree_write writes[2]; + struct bio *bio; +}; + +#define BTREE_FLAG(flag) \ +static inline bool btree_node_ ## flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + +enum btree_flags { + BTREE_NODE_read_done, + BTREE_NODE_io_error, + BTREE_NODE_dirty, + BTREE_NODE_write_idx, +}; + +BTREE_FLAG(read_done); +BTREE_FLAG(io_error); +BTREE_FLAG(dirty); +BTREE_FLAG(write_idx); + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline unsigned bset_offset(struct btree *b, struct bset *i) +{ + return (((size_t) i) - ((size_t) b->sets->data)) >> 9; +} + +static inline struct bset *write_block(struct btree *b) +{ + return ((void *) b->sets[0].data) + b->written * block_bytes(b->c); +} + +static inline bool bset_written(struct btree *b, struct bset_tree *t) +{ + return t->data < write_block(b); +} + +static inline bool bkey_written(struct btree *b, struct bkey *k) +{ + return k < write_block(b)->start; +} + +static inline void set_gc_sectors(struct cache_set *c) +{ + atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 8); +} + +static inline bool bch_ptr_invalid(struct btree *b, const struct bkey *k) +{ + return __bch_ptr_invalid(b->c, b->level, k); +} + +static inline struct bkey *bch_btree_iter_init(struct btree *b, + struct btree_iter *iter, + struct bkey *search) +{ + return __bch_btree_iter_init(b, iter, search, b->sets); +} + +/* Looping macros */ + +#define for_each_cached_btree(b, c, iter) \ + for (iter = 0; \ + iter < ARRAY_SIZE((c)->bucket_hash); \ + iter++) \ + hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) + +#define for_each_key_filter(b, k, iter, filter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next_filter((iter), b, filter));) + +#define for_each_key(b, k, iter) \ + for (bch_btree_iter_init((b), (iter), NULL); \ + ((k) = bch_btree_iter_next(iter));) + +/* Recursing down the btree */ + +struct btree_op { + struct closure cl; + struct cache_set *c; + + /* Journal entry we have a refcount on */ + atomic_t *journal; + + /* Bio to be inserted into the cache */ + struct bio *cache_bio; + + unsigned inode; + + uint16_t write_prio; + + /* Btree level at which we start taking write locks */ + short lock; + + /* Btree insertion type */ + enum { + BTREE_INSERT, + BTREE_REPLACE + } type:8; + + unsigned csum:1; + unsigned skip:1; + unsigned flush_journal:1; + + unsigned insert_data_done:1; + unsigned lookup_done:1; + unsigned insert_collision:1; + + /* Anything after this point won't get zeroed in do_bio_hook() */ + + /* Keys to be inserted */ + struct keylist keys; + BKEY_PADDED(replace); +}; + +void bch_btree_op_init_stack(struct btree_op *); + +static inline void rw_lock(bool w, struct btree *b, int level) +{ + w ? 
down_write_nested(&b->lock, level + 1) + : down_read_nested(&b->lock, level + 1); + if (w) + b->seq++; +} + +static inline void rw_unlock(bool w, struct btree *b) +{ +#ifdef CONFIG_BCACHE_EDEBUG + unsigned i; + + if (w && + b->key.ptr[0] && + btree_node_read_done(b)) + for (i = 0; i <= b->nsets; i++) + bch_check_key_order(b, b->sets[i].data); +#endif + + if (w) + b->seq++; + (w ? up_write : up_read)(&b->lock); +} + +#define insert_lock(s, b) ((b)->level <= (s)->lock) + +/* + * These macros are for recursing down the btree - they handle the details of + * locking and looking up nodes in the cache for you. They're best treated as + * mere syntax when reading code that uses them. + * + * op->lock determines whether we take a read or a write lock at a given depth. + * If you've got a read lock and find that you need a write lock (i.e. you're + * going to have to split), set op->lock and return -EINTR; btree_root() will + * call you again and you'll have the correct lock. + */ + +/** + * btree - recurse down the btree on a specified key + * @fn: function to call, which will be passed the child node + * @key: key to recurse on + * @b: parent btree node + * @op: pointer to struct btree_op + */ +#define btree(fn, key, b, op, ...) \ +({ \ + int _r, l = (b)->level - 1; \ + bool _w = l <= (op)->lock; \ + struct btree *_b = bch_btree_node_get((b)->c, key, l, op); \ + if (!IS_ERR(_b)) { \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + rw_unlock(_w, _b); \ + } else \ + _r = PTR_ERR(_b); \ + _r; \ +}) + +/** + * btree_root - call a function on the root of the btree + * @fn: function to call, which will be passed the child node + * @c: cache set + * @op: pointer to struct btree_op + */ +#define btree_root(fn, c, op, ...) \ +({ \ + int _r = -EINTR; \ + do { \ + struct btree *_b = (c)->root; \ + bool _w = insert_lock(op, _b); \ + rw_lock(_w, _b, _b->level); \ + if (_b == (c)->root && \ + _w == insert_lock(op, _b)) \ + _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__); \ + rw_unlock(_w, _b); \ + bch_cannibalize_unlock(c, &(op)->cl); \ + } while (_r == -EINTR); \ + \ + _r; \ +}) + +static inline bool should_split(struct btree *b) +{ + struct bset *i = write_block(b); + return b->written >= btree_blocks(b) || + (i->seq == b->sets[0].data->seq && + b->written + __set_blocks(i, i->keys + 15, b->c) + > btree_blocks(b)); +} + +void bch_btree_read_done(struct closure *); +void bch_btree_read(struct btree *); +void bch_btree_write(struct btree *b, bool now, struct btree_op *op); + +void bch_cannibalize_unlock(struct cache_set *, struct closure *); +void bch_btree_set_root(struct btree *); +struct btree *bch_btree_node_alloc(struct cache_set *, int, struct closure *); +struct btree *bch_btree_node_get(struct cache_set *, struct bkey *, + int, struct btree_op *); + +bool bch_btree_insert_keys(struct btree *, struct btree_op *); +bool bch_btree_insert_check_key(struct btree *, struct btree_op *, + struct bio *); +int bch_btree_insert(struct btree_op *, struct cache_set *); + +int bch_btree_search_recurse(struct btree *, struct btree_op *); + +void bch_queue_gc(struct cache_set *); +size_t bch_btree_gc_finish(struct cache_set *); +void bch_moving_gc(struct closure *); +int bch_btree_check(struct cache_set *, struct btree_op *); +uint8_t __bch_btree_mark_key(struct cache_set *, int, struct bkey *); + +void bch_keybuf_init(struct keybuf *, keybuf_pred_fn *); +void bch_refill_keybuf(struct cache_set *, struct keybuf *, struct bkey *); +bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, + struct bkey *); 
+void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
+struct keybuf_key *bch_keybuf_next(struct keybuf *);
+struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *,
+					  struct keybuf *, struct bkey *);
+
+#endif
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
new file mode 100644
index 000000000000..d6fbec0f8484
--- /dev/null
+++ b/drivers/md/bcache/closure.c
@@ -0,0 +1,348 @@
+/*
+ * Asynchronous refcounty things
+ *
+ * Copyright 2010, 2011 Kent Overstreet
+ * Copyright 2012 Google, Inc.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include "closure.h"
+
+void closure_queue(struct closure *cl)
+{
+	struct workqueue_struct *wq = cl->wq;
+	if (wq) {
+		INIT_WORK(&cl->work, cl->work.func);
+		BUG_ON(!queue_work(wq, &cl->work));
+	} else
+		cl->fn(cl);
+}
+EXPORT_SYMBOL_GPL(closure_queue);
+
+#define CL_FIELD(type, field)					\
+	case TYPE_ ## type:					\
+	return &container_of(cl, struct type, cl)->field
+
+static struct closure_waitlist *closure_waitlist(struct closure *cl)
+{
+	switch (cl->type) {
+		CL_FIELD(closure_with_waitlist, wait);
+		CL_FIELD(closure_with_waitlist_and_timer, wait);
+	default:
+		return NULL;
+	}
+}
+
+static struct timer_list *closure_timer(struct closure *cl)
+{
+	switch (cl->type) {
+		CL_FIELD(closure_with_timer, timer);
+		CL_FIELD(closure_with_waitlist_and_timer, timer);
+	default:
+		return NULL;
+	}
+}
+
+static inline void closure_put_after_sub(struct closure *cl, int flags)
+{
+	int r = flags & CLOSURE_REMAINING_MASK;
+
+	BUG_ON(flags & CLOSURE_GUARD_MASK);
+	BUG_ON(!r && (flags & ~(CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING)));
+
+	/* Must deliver precisely one wakeup */
+	if (r == 1 && (flags & CLOSURE_SLEEPING))
+		wake_up_process(cl->task);
+
+	if (!r) {
+		if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
+			/* CLOSURE_BLOCKING might be set - clear it */
+			atomic_set(&cl->remaining,
+				   CLOSURE_REMAINING_INITIALIZER);
+			closure_queue(cl);
+		} else {
+			struct closure *parent = cl->parent;
+			struct closure_waitlist *wait = closure_waitlist(cl);
+
+			closure_debug_destroy(cl);
+
+			atomic_set(&cl->remaining, -1);
+
+			if (wait)
+				closure_wake_up(wait);
+
+			if (cl->fn)
+				cl->fn(cl);
+
+			if (parent)
+				closure_put(parent);
+		}
+	}
+}
+
+/* For clearing flags with the same atomic op as a put */
+void closure_sub(struct closure *cl, int v)
+{
+	closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_sub);
+
+void closure_put(struct closure *cl)
+{
+	closure_put_after_sub(cl, atomic_dec_return(&cl->remaining));
+}
+EXPORT_SYMBOL_GPL(closure_put);
+
+static void set_waiting(struct closure *cl, unsigned long f)
+{
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+	cl->waiting_on = f;
+#endif
+}
+
+void __closure_wake_up(struct closure_waitlist *wait_list)
+{
+	struct llist_node *list;
+	struct closure *cl;
+	struct llist_node *reverse = NULL;
+
+	list = llist_del_all(&wait_list->list);
+
+	/* We first reverse the list to preserve FIFO ordering and fairness */
+
+	while (list) {
+		struct llist_node *t = list;
+		list = llist_next(list);
+
+		t->next = reverse;
+		reverse = t;
+	}
+
+	/* Then do the wakeups */
+
+	while (reverse) {
+		cl = container_of(reverse, struct closure, list);
+		reverse = llist_next(reverse);
+
+		set_waiting(cl, 0);
+		closure_sub(cl, CLOSURE_WAITING + 1);
+	}
+}
+EXPORT_SYMBOL_GPL(__closure_wake_up);
+
+bool closure_wait(struct closure_waitlist *list, struct closure *cl)
+{
+	if (atomic_read(&cl->remaining) & CLOSURE_WAITING)
+		return false;
+
+	set_waiting(cl, _RET_IP_);
+	atomic_add(CLOSURE_WAITING + 1, &cl->remaining);
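+	/* Once we are on the list (below), a concurrent __closure_wake_up()
+	 * may release the reference we just took. (Descriptive comment added
+	 * here; not in the original patch.)
+	 */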
+	llist_add(&cl->list, &list->list);
+
+	return true;
+}
+EXPORT_SYMBOL_GPL(closure_wait);
+
+/**
+ * closure_sync() - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+void closure_sync(struct closure *cl)
+{
+	while (1) {
+		__closure_start_sleep(cl);
+		closure_set_ret_ip(cl);
+
+		if ((atomic_read(&cl->remaining) &
+		     CLOSURE_REMAINING_MASK) == 1)
+			break;
+
+		schedule();
+	}
+
+	__closure_end_sleep(cl);
+}
+EXPORT_SYMBOL_GPL(closure_sync);
+
+/**
+ * closure_trylock() - try to acquire the closure, without waiting
+ * @cl: closure to lock
+ *
+ * Returns true if the closure was successfully locked.
+ */
+bool closure_trylock(struct closure *cl, struct closure *parent)
+{
+	if (atomic_cmpxchg(&cl->remaining, -1,
+			   CLOSURE_REMAINING_INITIALIZER) != -1)
+		return false;
+
+	closure_set_ret_ip(cl);
+
+	smp_mb();
+	cl->parent = parent;
+	if (parent)
+		closure_get(parent);
+
+	closure_debug_create(cl);
+	return true;
+}
+EXPORT_SYMBOL_GPL(closure_trylock);
+
+void __closure_lock(struct closure *cl, struct closure *parent,
+		    struct closure_waitlist *wait_list)
+{
+	struct closure wait;
+	closure_init_stack(&wait);
+
+	while (1) {
+		if (closure_trylock(cl, parent))
+			return;
+
+		closure_wait_event_sync(wait_list, &wait,
+					atomic_read(&cl->remaining) == -1);
+	}
+}
+EXPORT_SYMBOL_GPL(__closure_lock);
+
+static void closure_delay_timer_fn(unsigned long data)
+{
+	struct closure *cl = (struct closure *) data;
+	closure_sub(cl, CLOSURE_TIMER + 1);
+}
+
+void do_closure_timer_init(struct closure *cl)
+{
+	struct timer_list *timer = closure_timer(cl);
+
+	init_timer(timer);
+	timer->data = (unsigned long) cl;
+	timer->function = closure_delay_timer_fn;
+}
+EXPORT_SYMBOL_GPL(do_closure_timer_init);
+
+bool __closure_delay(struct closure *cl, unsigned long delay,
+		     struct timer_list *timer)
+{
+	if (atomic_read(&cl->remaining) & CLOSURE_TIMER)
+		return false;
+
+	BUG_ON(timer_pending(timer));
+
+	timer->expires = jiffies + delay;
+
+	atomic_add(CLOSURE_TIMER + 1, &cl->remaining);
+	add_timer(timer);
+	return true;
+}
+EXPORT_SYMBOL_GPL(__closure_delay);
+
+void __closure_flush(struct closure *cl, struct timer_list *timer)
+{
+	if (del_timer(timer))
+		closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush);
+
+void __closure_flush_sync(struct closure *cl, struct timer_list *timer)
+{
+	if (del_timer_sync(timer))
+		closure_sub(cl, CLOSURE_TIMER + 1);
+}
+EXPORT_SYMBOL_GPL(__closure_flush_sync);
+
+#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
+
+static LIST_HEAD(closure_list);
+static DEFINE_SPINLOCK(closure_list_lock);
+
+void closure_debug_create(struct closure *cl)
+{
+	unsigned long flags;
+
+	BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE);
+	cl->magic = CLOSURE_MAGIC_ALIVE;
+
+	spin_lock_irqsave(&closure_list_lock, flags);
+	list_add(&cl->all, &closure_list);
+	spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_create);
+
+void closure_debug_destroy(struct closure *cl)
+{
+	unsigned long flags;
+
+	BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE);
+	cl->magic = CLOSURE_MAGIC_DEAD;
+
+	spin_lock_irqsave(&closure_list_lock, flags);
+	list_del(&cl->all);
+	spin_unlock_irqrestore(&closure_list_lock, flags);
+}
+EXPORT_SYMBOL_GPL(closure_debug_destroy);
+
+static struct dentry *debug;
+
+#define work_data_bits(work) ((unsigned long *)(&(work)->data))
+
+static int debug_seq_show(struct seq_file *f, void *data)
+{
+	struct closure *cl;
+	spin_lock_irq(&closure_list_lock);
+
+	list_for_each_entry(cl, &closure_list, all) {
+		int r = atomic_read(&cl->remaining);
+
+		seq_printf(f, "%p: %pF -> %pf p %p r %i ",
+			   cl, (void *) cl->ip, cl->fn, cl->parent,
+			   r & CLOSURE_REMAINING_MASK);
+
+		seq_printf(f, "%s%s%s%s%s%s\n",
+			   test_bit(WORK_STRUCT_PENDING,
+				    work_data_bits(&cl->work)) ? "Q" : "",
+			   r & CLOSURE_RUNNING	? "R" : "",
+			   r & CLOSURE_BLOCKING	? "B" : "",
+			   r & CLOSURE_STACK	? "S" : "",
+			   r & CLOSURE_SLEEPING	? "Sl" : "",
+			   r & CLOSURE_TIMER	? "T" : "");
+
+		if (r & CLOSURE_WAITING)
+			seq_printf(f, " W %pF\n",
+				   (void *) cl->waiting_on);
+
+		seq_printf(f, "\n");
+	}
+
+	spin_unlock_irq(&closure_list_lock);
+	return 0;
+}
+
+static int debug_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, debug_seq_show, NULL);
+}
+
+static const struct file_operations debug_ops = {
+	.owner		= THIS_MODULE,
+	.open		= debug_seq_open,
+	.read		= seq_read,
+	.release	= single_release
+};
+
+int __init closure_debug_init(void)
+{
+	debug = debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops);
+	return 0;
+}
+
+module_init(closure_debug_init);
+
+#endif
+
+MODULE_AUTHOR("Kent Overstreet ");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
new file mode 100644
index 000000000000..3f31d599ea56
--- /dev/null
+++ b/drivers/md/bcache/closure.h
@@ -0,0 +1,670 @@
+#ifndef _LINUX_CLOSURE_H
+#define _LINUX_CLOSURE_H
+
+#include <linux/llist.h>
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+/*
+ * Closure is perhaps the most overused and abused term in computer science, but
+ * since I've been unable to come up with anything better you're stuck with it
+ * again.
+ *
+ * What are closures?
+ *
+ * They embed a refcount. The basic idea is they count "things that are in
+ * progress" - in flight bios, some other thread that's doing something else -
+ * anything you might want to wait on.
+ *
+ * The refcount may be manipulated with closure_get() and closure_put().
+ * closure_put() is where many of the interesting things happen, when it causes
+ * the refcount to go to 0.
+ *
+ * Closures can be used to wait on things both synchronously and asynchronously,
+ * and synchronous and asynchronous use can be mixed without restriction. To
+ * wait synchronously, use closure_sync() - you will sleep until your closure's
+ * refcount hits 1.
+ *
+ * To wait asynchronously, use
+ *   continue_at(cl, next_function, workqueue);
+ *
+ * passing it, as you might expect, the function to run when nothing is pending
+ * and the workqueue to run that function out of.
+ *
+ * continue_at() also, critically, is a macro that returns the calling function.
+ * There's good reason for this.
+ *
+ * To safely use closures asynchronously, they must always have a refcount while
+ * they are running, owned by the thread that is running them. Otherwise,
+ * suppose you submit some bios and wish to have a function run when they all
+ * complete:
+ *
+ * foo_endio(struct bio *bio, int error)
+ * {
+ *	closure_put(cl);
+ * }
+ *
+ * closure_init(cl);
+ *
+ * do_stuff();
+ * closure_get(cl);
+ * bio1->bi_endio = foo_endio;
+ * bio_submit(bio1);
+ *
+ * do_more_stuff();
+ * closure_get(cl);
+ * bio2->bi_endio = foo_endio;
+ * bio_submit(bio2);
+ *
+ * continue_at(cl, complete_some_read, system_wq);
+ *
+ * If the closure's refcount started at 0, complete_some_read() could run before
+ * the second bio was submitted - which is almost always not what you want!
+ * More importantly, it wouldn't be possible to say whether the original thread
+ * or complete_some_read()'s thread owned the closure - and whatever state it
+ * was associated with!
+ *
+ * So, closure_init() initializes a closure's refcount to 1 - and when a
+ * closure_fn is run, the refcount will be reset to 1 first.
+ *
+ * Then, the rule is - if you got the refcount with closure_get(), release it
+ * with closure_put() (i.e., in a bio->bi_endio function). If you have a
+ * refcount on a closure because you called closure_init() or you were run out
+ * of a closure - _always_ use continue_at(). Doing so consistently will help
+ * eliminate an entire class of particularly pernicious races.
+ *
+ * For a closure to wait on an arbitrary event, we need to introduce waitlists:
+ *
+ * struct closure_waitlist list;
+ * closure_wait_event(list, cl, condition);
+ * closure_wake_up(wait_list);
+ *
+ * These work analogously to wait_event() and wake_up() - except that instead of
+ * operating on the current thread (for wait_event()) and lists of threads, they
+ * operate on an explicit closure and lists of closures.
+ *
+ * Because it's a closure we can now wait either synchronously or
+ * asynchronously. closure_wait_event() returns the current value of the
+ * condition, and if it returned false continue_at() or closure_sync() can be
+ * used to wait for it to become true.
+ *
+ * It's useful for waiting on things when you can't sleep in the context in
+ * which you must check the condition (perhaps a spinlock held, or you might be
+ * beneath generic_make_request() - in which case you can't sleep on IO).
+ *
+ * closure_wait_event() will wait either synchronously or asynchronously,
+ * depending on whether the closure is in blocking mode or not. You can pick a
+ * mode explicitly with closure_wait_event_sync() and
+ * closure_wait_event_async(), which do just what you might expect.
+ *
+ * Lastly, you might have a wait list dedicated to a specific event, and have no
+ * need for specifying the condition - you just want to wait until someone runs
+ * closure_wake_up() on the appropriate wait list. In that case, just use
+ * closure_wait(). It will return either true or false, depending on whether the
+ * closure was already on a wait list or not - a closure can only be on one wait
+ * list at a time.
+ *
+ * Parents:
+ *
+ * closure_init() takes two arguments - it takes the closure to initialize, and
+ * a (possibly null) parent.
+ *
+ * If parent is non null, the new closure will have a refcount for its lifetime;
+ * a closure is considered to be "finished" when its refcount hits 0 and the
+ * function to run is null. Hence
+ *
+ * continue_at(cl, NULL, NULL);
+ *
+ * returns up the (spaghetti) stack of closures, precisely like normal return
+ * returns up the C stack. continue_at() with non null fn is better thought of
+ * as doing a tail call.
+ *
+ * All this implies that a closure should typically be embedded in a particular
+ * struct (which its refcount will normally control the lifetime of), and that
+ * struct can very much be thought of as a stack frame.
+ *
+ * Locking:
+ *
+ * Closures are based on work items but they can be thought of as more like
+ * threads - in that like threads and unlike work items they have a well
+ * defined lifetime; they are created (with closure_init()) and eventually
+ * complete after a continue_at(cl, NULL, NULL).
+ *
+ * Suppose you've got some larger structure with a closure embedded in it that's
+ * used for periodically doing garbage collection.
+ * You only want one garbage collection happening at a time, so the natural
+ * thing to do is protect it with a lock. However, it's difficult to use a lock
+ * protecting a closure correctly because the unlock should come after the last
+ * continue_at() (additionally, if you're using the closure asynchronously a
+ * mutex won't work since a mutex has to be unlocked by the same process that
+ * locked it).
+ *
+ * So to make it less error prone and more efficient, we also have the ability
+ * to use closures as locks:
+ *
+ * closure_init_unlocked();
+ * closure_trylock();
+ *
+ * That's all we need for trylock() - the last closure_put() implicitly unlocks
+ * it for you. But for closure_lock(), we also need a wait list:
+ *
+ * struct closure_with_waitlist frobnicator_cl;
+ *
+ * closure_init_unlocked(&frobnicator_cl);
+ * closure_lock(&frobnicator_cl);
+ *
+ * A closure_with_waitlist embeds a closure and a wait list - much like struct
+ * delayed_work embeds a work item and a timer_list. The important thing is, use
+ * it exactly like you would a regular closure and closure_put() will magically
+ * handle everything for you.
+ *
+ * We've got closures that embed timers, too. They're called, appropriately
+ * enough:
+ * struct closure_with_timer;
+ *
+ * This gives you access to closure_delay(). It takes a refcount for a specified
+ * number of jiffies - you could then call closure_sync() (for a slightly
+ * convoluted version of msleep()) or continue_at() - which gives you the same
+ * effect as using a delayed work item, except you can reuse the work_struct
+ * already embedded in struct closure.
+ *
+ * Lastly, there's struct closure_with_waitlist_and_timer. It does what you
+ * probably expect, if you happen to need the features of both. (You don't
+ * really want to know how all this is implemented, but if I've done my job
+ * right you shouldn't have to care).
+ */
+
+struct closure;
+typedef void (closure_fn) (struct closure *);
+
+struct closure_waitlist {
+	struct llist_head list;
+};
+
+enum closure_type {
+	TYPE_closure				= 0,
+	TYPE_closure_with_waitlist		= 1,
+	TYPE_closure_with_timer			= 2,
+	TYPE_closure_with_waitlist_and_timer	= 3,
+	MAX_CLOSURE_TYPE			= 3,
+};
+
+enum closure_state {
+	/*
+	 * CLOSURE_BLOCKING: Causes closure_wait_event() to block, instead of
+	 * waiting asynchronously
+	 *
+	 * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by
+	 * the thread that owns the closure, and cleared by the thread that's
+	 * waking up the closure.
+	 *
+	 * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
+	 * - indicates that cl->task is valid and closure_put() may wake it up.
+	 * Only set or cleared by the thread that owns the closure.
+	 *
+	 * CLOSURE_TIMER: Analogous to CLOSURE_WAITING, indicates that a closure
+	 * has an outstanding timer. Must be set by the thread that owns the
+	 * closure, and cleared by the timer function when the timer goes off.
+	 *
+	 * The rest are for debugging and don't affect behaviour:
+	 *
+	 * CLOSURE_RUNNING: Set when a closure is running (i.e. by
+	 * closure_init() and when closure_put() runs the next function), and
+	 * must be cleared before remaining hits 0. Primarily to help guard
+	 * against incorrect usage and accidentally transferring references.
+	 * continue_at() and closure_return() clear it for you, if you're doing
+	 * something unusual you can use closure_set_dead() which also helps
+	 * annotate where references are being transferred.
+ * + * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a + * closure with this flag set + */ + + CLOSURE_BITS_START = (1 << 19), + CLOSURE_DESTRUCTOR = (1 << 19), + CLOSURE_BLOCKING = (1 << 21), + CLOSURE_WAITING = (1 << 23), + CLOSURE_SLEEPING = (1 << 25), + CLOSURE_TIMER = (1 << 27), + CLOSURE_RUNNING = (1 << 29), + CLOSURE_STACK = (1 << 31), +}; + +#define CLOSURE_GUARD_MASK \ + ((CLOSURE_DESTRUCTOR|CLOSURE_BLOCKING|CLOSURE_WAITING| \ + CLOSURE_SLEEPING|CLOSURE_TIMER|CLOSURE_RUNNING|CLOSURE_STACK) << 1) + +#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) + +struct closure { + union { + struct { + struct workqueue_struct *wq; + struct task_struct *task; + struct llist_node list; + closure_fn *fn; + }; + struct work_struct work; + }; + + struct closure *parent; + + atomic_t remaining; + + enum closure_type type; + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#define CLOSURE_MAGIC_DEAD 0xc054dead +#define CLOSURE_MAGIC_ALIVE 0xc054a11e + + unsigned magic; + struct list_head all; + unsigned long ip; + unsigned long waiting_on; +#endif +}; + +struct closure_with_waitlist { + struct closure cl; + struct closure_waitlist wait; +}; + +struct closure_with_timer { + struct closure cl; + struct timer_list timer; +}; + +struct closure_with_waitlist_and_timer { + struct closure cl; + struct closure_waitlist wait; + struct timer_list timer; +}; + +extern unsigned invalid_closure_type(void); + +#define __CLOSURE_TYPE(cl, _t) \ + __builtin_types_compatible_p(typeof(cl), struct _t) \ + ? TYPE_ ## _t : \ + +#define __closure_type(cl) \ +( \ + __CLOSURE_TYPE(cl, closure) \ + __CLOSURE_TYPE(cl, closure_with_waitlist) \ + __CLOSURE_TYPE(cl, closure_with_timer) \ + __CLOSURE_TYPE(cl, closure_with_waitlist_and_timer) \ + invalid_closure_type() \ +) + +void closure_sub(struct closure *cl, int v); +void closure_put(struct closure *cl); +void closure_queue(struct closure *cl); +void __closure_wake_up(struct closure_waitlist *list); +bool closure_wait(struct closure_waitlist *list, struct closure *cl); +void closure_sync(struct closure *cl); + +bool closure_trylock(struct closure *cl, struct closure *parent); +void __closure_lock(struct closure *cl, struct closure *parent, + struct closure_waitlist *wait_list); + +void do_closure_timer_init(struct closure *cl); +bool __closure_delay(struct closure *cl, unsigned long delay, + struct timer_list *timer); +void __closure_flush(struct closure *cl, struct timer_list *timer); +void __closure_flush_sync(struct closure *cl, struct timer_list *timer); + +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + +void closure_debug_create(struct closure *cl); +void closure_debug_destroy(struct closure *cl); + +#else + +static inline void closure_debug_create(struct closure *cl) {} +static inline void closure_debug_destroy(struct closure *cl) {} + +#endif + +static inline void closure_set_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _THIS_IP_; +#endif +} + +static inline void closure_set_ret_ip(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + cl->ip = _RET_IP_; +#endif +} + +static inline void closure_get(struct closure *cl) +{ +#ifdef CONFIG_BCACHE_CLOSURES_DEBUG + BUG_ON((atomic_inc_return(&cl->remaining) & + CLOSURE_REMAINING_MASK) <= 1); +#else + atomic_inc(&cl->remaining); +#endif +} + +static inline void closure_set_stopped(struct closure *cl) +{ + atomic_sub(CLOSURE_RUNNING, &cl->remaining); +} + +static inline bool closure_is_stopped(struct closure *cl) +{ + return 
!(atomic_read(&cl->remaining) & CLOSURE_RUNNING); +} + +static inline bool closure_is_unlocked(struct closure *cl) +{ + return atomic_read(&cl->remaining) == -1; +} + +static inline void do_closure_init(struct closure *cl, struct closure *parent, + bool running) +{ + switch (cl->type) { + case TYPE_closure_with_timer: + case TYPE_closure_with_waitlist_and_timer: + do_closure_timer_init(cl); + default: + break; + } + + cl->parent = parent; + if (parent) + closure_get(parent); + + if (running) { + closure_debug_create(cl); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); + } else + atomic_set(&cl->remaining, -1); + + closure_set_ip(cl); +} + +/* + * Hack to get at the embedded closure if there is one, by doing an unsafe cast: + * the result of __closure_type() is thrown away, it's used merely for type + * checking. + */ +#define __to_internal_closure(cl) \ +({ \ + BUILD_BUG_ON(__closure_type(*cl) > MAX_CLOSURE_TYPE); \ + (struct closure *) cl; \ +}) + +#define closure_init_type(cl, parent, running) \ +do { \ + struct closure *_cl = __to_internal_closure(cl); \ + _cl->type = __closure_type(*(cl)); \ + do_closure_init(_cl, parent, running); \ +} while (0) + +/** + * __closure_init() - Initialize a closure, skipping the memset() + * + * May be used instead of closure_init() when memory has already been zeroed. + */ +#define __closure_init(cl, parent) \ + closure_init_type(cl, parent, true) + +/** + * closure_init() - Initialize a closure, setting the refcount to 1 + * @cl: closure to initialize + * @parent: parent of the new closure. cl will take a refcount on it for its + * lifetime; may be NULL. + */ +#define closure_init(cl, parent) \ +do { \ + memset((cl), 0, sizeof(*(cl))); \ + __closure_init(cl, parent); \ +} while (0) + +static inline void closure_init_stack(struct closure *cl) +{ + memset(cl, 0, sizeof(struct closure)); + atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER| + CLOSURE_BLOCKING|CLOSURE_STACK); +} + +/** + * closure_init_unlocked() - Initialize a closure but leave it unlocked. + * @cl: closure to initialize + * + * For when the closure will be used as a lock. The closure may not be used + * until after a closure_lock() or closure_trylock(). + */ +#define closure_init_unlocked(cl) \ +do { \ + memset((cl), 0, sizeof(*(cl))); \ + closure_init_type(cl, NULL, false); \ +} while (0) + +/** + * closure_lock() - lock and initialize a closure. + * @cl: the closure to lock + * @parent: the new parent for this closure + * + * The closure must be of one of the types that has a waitlist (otherwise we + * wouldn't be able to sleep on contention). + * + * @parent has exactly the same meaning as in closure_init(); if non null, the + * closure will take a reference on @parent which will be released when it is + * unlocked. + */ +#define closure_lock(cl, parent) \ + __closure_lock(__to_internal_closure(cl), parent, &(cl)->wait) + +/** + * closure_delay() - delay some number of jiffies + * @cl: the closure that will sleep + * @delay: the delay in jiffies + * + * Takes a refcount on @cl which will be released after @delay jiffies; this may + * be used to have a function run after a delay with continue_at(), or + * closure_sync() may be used for a convoluted version of msleep(). 
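+ *
+ * An illustrative sketch (not part of the original comment), assuming a
+ * struct closure_with_timer t initialized with closure_init(&t, NULL):
+ *
+ *	closure_delay(&t, HZ);
+ *	closure_sync(&t.cl);	(sleeps for roughly one second)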
+ */ +#define closure_delay(cl, delay) \ + __closure_delay(__to_internal_closure(cl), delay, &(cl)->timer) + +#define closure_flush(cl) \ + __closure_flush(__to_internal_closure(cl), &(cl)->timer) + +#define closure_flush_sync(cl) \ + __closure_flush_sync(__to_internal_closure(cl), &(cl)->timer) + +static inline void __closure_end_sleep(struct closure *cl) +{ + __set_current_state(TASK_RUNNING); + + if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING) + atomic_sub(CLOSURE_SLEEPING, &cl->remaining); +} + +static inline void __closure_start_sleep(struct closure *cl) +{ + closure_set_ip(cl); + cl->task = current; + set_current_state(TASK_UNINTERRUPTIBLE); + + if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING)) + atomic_add(CLOSURE_SLEEPING, &cl->remaining); +} + +/** + * closure_blocking() - returns true if the closure is in blocking mode. + * + * If a closure is in blocking mode, closure_wait_event() will sleep until the + * condition is true instead of waiting asynchronously. + */ +static inline bool closure_blocking(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_BLOCKING; +} + +/** + * set_closure_blocking() - put a closure in blocking mode. + * + * If a closure is in blocking mode, closure_wait_event() will sleep until the + * condition is true instead of waiting asynchronously. + * + * Not thread safe - can only be called by the thread running the closure. + */ +static inline void set_closure_blocking(struct closure *cl) +{ + if (!closure_blocking(cl)) + atomic_add(CLOSURE_BLOCKING, &cl->remaining); +} + +/* + * Not thread safe - can only be called by the thread running the closure. + */ +static inline void clear_closure_blocking(struct closure *cl) +{ + if (closure_blocking(cl)) + atomic_sub(CLOSURE_BLOCKING, &cl->remaining); +} + +/** + * closure_wake_up() - wake up all closures on a wait list. + */ +static inline void closure_wake_up(struct closure_waitlist *list) +{ + smp_mb(); + __closure_wake_up(list); +} + +/* + * Wait on an event, synchronously or asynchronously - analogous to wait_event() + * but for closures. + * + * The loop is oddly structured so as to avoid a race; we must check the + * condition again after we've added ourself to the waitlist. We know if we were + * already on the waitlist because closure_wait() returns false; thus, we only + * schedule or break if closure_wait() returns false. If it returns true, we + * just loop again - rechecking the condition. + * + * The __closure_wake_up() is necessary because we may race with the event + * becoming true; i.e. we see event false -> wait -> recheck condition, but the + * thread that made the event true may have called closure_wake_up() before we + * added ourself to the wait list. + * + * We have to call closure_sync() at the end instead of just + * __closure_end_sleep() because a different thread might've called + * closure_wake_up() before us and gotten preempted before they dropped the + * refcount on our closure. If this was a stack allocated closure, that would be + * bad. + */ +#define __closure_wait_event(list, cl, condition, _block) \ +({ \ + bool block = _block; \ + typeof(condition) ret; \ + \ + while (1) { \ + ret = (condition); \ + if (ret) { \ + __closure_wake_up(list); \ + if (block) \ + closure_sync(cl); \ + \ + break; \ + } \ + \ + if (block) \ + __closure_start_sleep(cl); \ + \ + if (!closure_wait(list, cl)) { \ + if (!block) \ + break; \ + \ + schedule(); \ + } \ + } \ + \ + ret; \ +}) + +/** + * closure_wait_event() - wait on a condition, synchronously or asynchronously. 
+ * @list: the wait list to wait on + * @cl: the closure that is doing the waiting + * @condition: a C expression for the event to wait for + * + * If the closure is in blocking mode, sleeps until the @condition evaluates to + * true - exactly like wait_event(). + * + * If the closure is not in blocking mode, waits asynchronously; if the + * condition is currently false the @cl is put onto @list and returns. @list + * owns a refcount on @cl; closure_sync() or continue_at() may be used later to + * wait for another thread to wake up @list, which drops the refcount on @cl. + * + * Returns the value of @condition; @cl will be on @list iff @condition was + * false. + * + * closure_wake_up(@list) must be called after changing any variable that could + * cause @condition to become true. + */ +#define closure_wait_event(list, cl, condition) \ + __closure_wait_event(list, cl, condition, closure_blocking(cl)) + +#define closure_wait_event_async(list, cl, condition) \ + __closure_wait_event(list, cl, condition, false) + +#define closure_wait_event_sync(list, cl, condition) \ + __closure_wait_event(list, cl, condition, true) + +static inline void set_closure_fn(struct closure *cl, closure_fn *fn, + struct workqueue_struct *wq) +{ + BUG_ON(object_is_on_stack(cl)); + closure_set_ip(cl); + cl->fn = fn; + cl->wq = wq; + /* between atomic_dec() in closure_put() */ + smp_mb__before_atomic_dec(); +} + +#define continue_at(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_sub(_cl, CLOSURE_RUNNING + 1); \ + return; \ +} while (0) + +#define closure_return(_cl) continue_at((_cl), NULL, NULL) + +#define continue_at_nobarrier(_cl, _fn, _wq) \ +do { \ + set_closure_fn(_cl, _fn, _wq); \ + closure_queue(cl); \ + return; \ +} while (0) + +#define closure_return_with_destructor(_cl, _destructor) \ +do { \ + set_closure_fn(_cl, _destructor, NULL); \ + closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ + return; \ +} while (0) + +static inline void closure_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + closure_init(cl, parent); + continue_at_nobarrier(cl, fn, wq); +} + +static inline void closure_trylock_call(struct closure *cl, closure_fn fn, + struct workqueue_struct *wq, + struct closure *parent) +{ + if (closure_trylock(cl, parent)) + continue_at_nobarrier(cl, fn, wq); +} + +#endif /* _LINUX_CLOSURE_H */ diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c new file mode 100644 index 000000000000..4b37ef2b80e5 --- /dev/null +++ b/drivers/md/bcache/debug.c @@ -0,0 +1,563 @@ +/* + * Assorted bcache debug code + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include +#include +#include +#include +#include + +static struct dentry *debug; + +const char *bch_ptr_status(struct cache_set *c, const struct bkey *k) +{ + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) + if (ptr_available(c, k, i)) { + struct cache *ca = PTR_CACHE(c, k, i); + size_t bucket = PTR_BUCKET_NR(c, k, i); + size_t r = bucket_remainder(c, PTR_OFFSET(k, i)); + + if (KEY_SIZE(k) + r > c->sb.bucket_size) + return "bad, length too big"; + if (bucket < ca->sb.first_bucket) + return "bad, short offset"; + if (bucket >= ca->sb.nbuckets) + return "bad, offset past end of device"; + if (ptr_stale(c, k, i)) + return "stale"; + } + + if (!bkey_cmp(k, &ZERO_KEY)) + return "bad, null key"; + if (!KEY_PTRS(k)) + return "bad, no pointers"; + if (!KEY_SIZE(k)) + return "zeroed key"; + return ""; +} + +struct keyprint_hack bch_pkey(const struct bkey *k) +{ + unsigned i = 0; + struct keyprint_hack r; + char *out = r.s, *end = r.s + KEYHACK_SIZE; + +#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__)) + + p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_OFFSET(k), KEY_SIZE(k)); + + if (KEY_PTRS(k)) + while (1) { + p("%llu:%llu gen %llu", + PTR_DEV(k, i), PTR_OFFSET(k, i), PTR_GEN(k, i)); + + if (++i == KEY_PTRS(k)) + break; + + p(", "); + } + + p("]"); + + if (KEY_DIRTY(k)) + p(" dirty"); + if (KEY_CSUM(k)) + p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]); +#undef p + return r; +} + +struct keyprint_hack bch_pbtree(const struct btree *b) +{ + struct keyprint_hack r; + + snprintf(r.s, 40, "%li level %i/%i", PTR_BUCKET_NR(b->c, &b->key, 0), + b->level, b->c->root ? b->c->root->level : -1); + return r; +} + +#if defined(CONFIG_BCACHE_DEBUG) || defined(CONFIG_BCACHE_EDEBUG) + +static bool skipped_backwards(struct btree *b, struct bkey *k) +{ + return bkey_cmp(k, (!b->level) + ? 
&START_KEY(bkey_next(k)) + : bkey_next(k)) > 0; +} + +static void dump_bset(struct btree *b, struct bset *i) +{ + struct bkey *k; + unsigned j; + + for (k = i->start; k < end(i); k = bkey_next(k)) { + printk(KERN_ERR "block %zu key %zi/%u: %s", index(i, b), + (uint64_t *) k - i->d, i->keys, pkey(k)); + + for (j = 0; j < KEY_PTRS(k); j++) { + size_t n = PTR_BUCKET_NR(b->c, k, j); + printk(" bucket %zu", n); + + if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets) + printk(" prio %i", + PTR_BUCKET(b->c, k, j)->prio); + } + + printk(" %s\n", bch_ptr_status(b->c, k)); + + if (bkey_next(k) < end(i) && + skipped_backwards(b, k)) + printk(KERN_ERR "Key skipped backwards\n"); + } +} + +#endif + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_btree_verify(struct btree *b, struct bset *new) +{ + struct btree *v = b->c->verify_data; + struct closure cl; + closure_init_stack(&cl); + + if (!b->c->verify) + return; + + closure_wait_event(&b->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + + mutex_lock(&b->c->verify_lock); + + bkey_copy(&v->key, &b->key); + v->written = 0; + v->level = b->level; + + bch_btree_read(v); + closure_wait_event(&v->io.wait, &cl, + atomic_read(&b->io.cl.remaining) == -1); + + if (new->keys != v->sets[0].data->keys || + memcmp(new->start, + v->sets[0].data->start, + (void *) end(new) - (void *) new->start)) { + unsigned i, j; + + console_lock(); + + printk(KERN_ERR "*** original memory node:\n"); + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + + printk(KERN_ERR "*** sorted memory node:\n"); + dump_bset(b, new); + + printk(KERN_ERR "*** on disk node:\n"); + dump_bset(v, v->sets[0].data); + + for (j = 0; j < new->keys; j++) + if (new->d[j] != v->sets[0].data->d[j]) + break; + + console_unlock(); + panic("verify failed at %u\n", j); + } + + mutex_unlock(&b->c->verify_lock); +} + +static void data_verify_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +void bch_data_verify(struct search *s) +{ + char name[BDEVNAME_SIZE]; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct closure *cl = &s->cl; + struct bio *check; + struct bio_vec *bv; + int i; + + if (!s->unaligned_bvec) + bio_for_each_segment(bv, s->orig_bio, i) + bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; + + check = bio_clone(s->orig_bio, GFP_NOIO); + if (!check) + return; + + if (bio_alloc_pages(check, GFP_NOIO)) + goto out_put; + + check->bi_rw = READ_SYNC; + check->bi_private = cl; + check->bi_end_io = data_verify_endio; + + closure_bio_submit(check, cl, &dc->disk); + closure_sync(cl); + + bio_for_each_segment(bv, s->orig_bio, i) { + void *p1 = kmap(bv->bv_page); + void *p2 = kmap(check->bi_io_vec[i].bv_page); + + if (memcmp(p1 + bv->bv_offset, + p2 + bv->bv_offset, + bv->bv_len)) + printk(KERN_ERR "bcache (%s): verify failed" + " at sector %llu\n", + bdevname(dc->bdev, name), + (uint64_t) s->orig_bio->bi_sector); + + kunmap(bv->bv_page); + kunmap(check->bi_io_vec[i].bv_page); + } + + __bio_for_each_segment(bv, check, i, 0) + __free_page(bv->bv_page); +out_put: + bio_put(check); +} + +#endif + +#ifdef CONFIG_BCACHE_EDEBUG + +unsigned bch_count_data(struct btree *b) +{ + unsigned ret = 0; + struct btree_iter iter; + struct bkey *k; + + if (!b->level) + for_each_key(b, k, &iter) + ret += KEY_SIZE(k); + return ret; +} + +static void vdump_bucket_and_panic(struct btree *b, const char *fmt, + va_list args) +{ + unsigned i; + + console_lock(); + + for (i = 0; i <= b->nsets; i++) + dump_bset(b, b->sets[i].data); + + 
vprintk(fmt, args); + + console_unlock(); + + panic("at %s\n", pbtree(b)); +} + +void bch_check_key_order_msg(struct btree *b, struct bset *i, + const char *fmt, ...) +{ + struct bkey *k; + + if (!i->keys) + return; + + for (k = i->start; bkey_next(k) < end(i); k = bkey_next(k)) + if (skipped_backwards(b, k)) { + va_list args; + va_start(args, fmt); + + vdump_bucket_and_panic(b, fmt, args); + va_end(args); + } +} + +void bch_check_keys(struct btree *b, const char *fmt, ...) +{ + va_list args; + struct bkey *k, *p = NULL; + struct btree_iter iter; + + if (b->level) + return; + + for_each_key(b, k, &iter) { + if (p && bkey_cmp(&START_KEY(p), &START_KEY(k)) > 0) { + printk(KERN_ERR "Keys out of order:\n"); + goto bug; + } + + if (bch_ptr_invalid(b, k)) + continue; + + if (p && bkey_cmp(p, &START_KEY(k)) > 0) { + printk(KERN_ERR "Overlapping keys:\n"); + goto bug; + } + p = k; + } + return; +bug: + va_start(args, fmt); + vdump_bucket_and_panic(b, fmt, args); + va_end(args); +} + +#endif + +#ifdef CONFIG_DEBUG_FS + +/* XXX: cache set refcounting */ + +struct dump_iterator { + char buf[PAGE_SIZE]; + size_t bytes; + struct cache_set *c; + struct keybuf keys; +}; + +static bool dump_pred(struct keybuf *buf, struct bkey *k) +{ + return true; +} + +static ssize_t bch_dump_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iterator *i = file->private_data; + ssize_t ret = 0; + + while (size) { + struct keybuf_key *w; + unsigned bytes = min(i->bytes, size); + + int err = copy_to_user(buf, i->buf, bytes); + if (err) + return err; + + ret += bytes; + buf += bytes; + size -= bytes; + i->bytes -= bytes; + memmove(i->buf, i->buf + bytes, i->bytes); + + if (i->bytes) + break; + + w = bch_keybuf_next_rescan(i->c, &i->keys, &MAX_KEY); + if (!w) + break; + + i->bytes = snprintf(i->buf, PAGE_SIZE, "%s\n", pkey(&w->key)); + bch_keybuf_del(&i->keys, w); + } + + return ret; +} + +static int bch_dump_open(struct inode *inode, struct file *file) +{ + struct cache_set *c = inode->i_private; + struct dump_iterator *i; + + i = kzalloc(sizeof(struct dump_iterator), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->c = c; + bch_keybuf_init(&i->keys, dump_pred); + i->keys.last_scanned = KEY(0, 0, 0); + + return 0; +} + +static int bch_dump_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + return 0; +} + +static const struct file_operations cache_set_debug_ops = { + .owner = THIS_MODULE, + .open = bch_dump_open, + .read = bch_dump_read, + .release = bch_dump_release +}; + +void bch_debug_init_cache_set(struct cache_set *c) +{ + if (!IS_ERR_OR_NULL(debug)) { + char name[50]; + snprintf(name, 50, "bcache-%pU", c->sb.set_uuid); + + c->debug = debugfs_create_file(name, 0400, debug, c, + &cache_set_debug_ops); + } +} + +#endif + +#ifdef CONFIG_BCACHE_DEBUG +static ssize_t btree_fuzz(struct kobject *k, struct kobj_attribute *a, + const char *buffer, size_t size) +{ + void dump(struct btree *b) + { + struct bset *i; + + for (i = b->sets[0].data; + index(i, b) < btree_blocks(b) && + i->seq == b->sets[0].data->seq; + i = ((void *) i) + set_blocks(i, b->c) * block_bytes(b->c)) + dump_bset(b, i); + } + + struct cache_sb *sb; + struct cache_set *c; + struct btree *all[3], *b, *fill, *orig; + int j; + + struct btree_op op; + bch_btree_op_init_stack(&op); + + sb = kzalloc(sizeof(struct cache_sb), GFP_KERNEL); + if (!sb) + return -ENOMEM; + + sb->bucket_size = 128; + sb->block_size = 4; + + c = bch_cache_set_alloc(sb); + if (!c) + return -ENOMEM; 
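+
+	/* Take three nodes off the btree node cache to use as scratch space */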
+ + for (j = 0; j < 3; j++) { + BUG_ON(list_empty(&c->btree_cache)); + all[j] = list_first_entry(&c->btree_cache, struct btree, list); + list_del_init(&all[j]->list); + + all[j]->key = KEY(0, 0, c->sb.bucket_size); + bkey_copy_key(&all[j]->key, &MAX_KEY); + } + + b = all[0]; + fill = all[1]; + orig = all[2]; + + while (1) { + for (j = 0; j < 3; j++) + all[j]->written = all[j]->nsets = 0; + + bch_bset_init_next(b); + + while (1) { + struct bset *i = write_block(b); + struct bkey *k = op.keys.top; + unsigned rand; + + bkey_init(k); + rand = get_random_int(); + + op.type = rand & 1 + ? BTREE_INSERT + : BTREE_REPLACE; + rand >>= 1; + + SET_KEY_SIZE(k, bucket_remainder(c, rand)); + rand >>= c->bucket_bits; + rand &= 1024 * 512 - 1; + rand += c->sb.bucket_size; + SET_KEY_OFFSET(k, rand); +#if 0 + SET_KEY_PTRS(k, 1); +#endif + bch_keylist_push(&op.keys); + bch_btree_insert_keys(b, &op); + + if (should_split(b) || + set_blocks(i, b->c) != + __set_blocks(i, i->keys + 15, b->c)) { + i->csum = csum_set(i); + + memcpy(write_block(fill), + i, set_bytes(i)); + + b->written += set_blocks(i, b->c); + fill->written = b->written; + if (b->written == btree_blocks(b)) + break; + + bch_btree_sort_lazy(b); + bch_bset_init_next(b); + } + } + + memcpy(orig->sets[0].data, + fill->sets[0].data, + btree_bytes(c)); + + bch_btree_sort(b); + fill->written = 0; + bch_btree_read_done(&fill->io.cl); + + if (b->sets[0].data->keys != fill->sets[0].data->keys || + memcmp(b->sets[0].data->start, + fill->sets[0].data->start, + b->sets[0].data->keys * sizeof(uint64_t))) { + struct bset *i = b->sets[0].data; + struct bkey *k, *l; + + for (k = i->start, + l = fill->sets[0].data->start; + k < end(i); + k = bkey_next(k), l = bkey_next(l)) + if (bkey_cmp(k, l) || + KEY_SIZE(k) != KEY_SIZE(l)) + pr_err("key %zi differs: %s " + "!= %s", (uint64_t *) k - i->d, + pkey(k), pkey(l)); + + for (j = 0; j < 3; j++) { + pr_err("**** Set %i ****", j); + dump(all[j]); + } + panic("\n"); + } + + pr_info("fuzz complete: %i keys", b->sets[0].data->keys); + } +} + +kobj_attribute_write(fuzz, btree_fuzz); +#endif + +void bch_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(debug)) + debugfs_remove_recursive(debug); +} + +int __init bch_debug_init(struct kobject *kobj) +{ + int ret = 0; +#ifdef CONFIG_BCACHE_DEBUG + ret = sysfs_create_file(kobj, &ksysfs_fuzz.attr); + if (ret) + return ret; +#endif + + debug = debugfs_create_dir("bcache", NULL); + return ret; +} diff --git a/drivers/md/bcache/debug.h b/drivers/md/bcache/debug.h new file mode 100644 index 000000000000..f9378a218148 --- /dev/null +++ b/drivers/md/bcache/debug.h @@ -0,0 +1,54 @@ +#ifndef _BCACHE_DEBUG_H +#define _BCACHE_DEBUG_H + +/* Btree/bkey debug printing */ + +#define KEYHACK_SIZE 80 +struct keyprint_hack { + char s[KEYHACK_SIZE]; +}; + +struct keyprint_hack bch_pkey(const struct bkey *k); +struct keyprint_hack bch_pbtree(const struct btree *b); +#define pkey(k) (&bch_pkey(k).s[0]) +#define pbtree(b) (&bch_pbtree(b).s[0]) + +#ifdef CONFIG_BCACHE_EDEBUG + +unsigned bch_count_data(struct btree *); +void bch_check_key_order_msg(struct btree *, struct bset *, const char *, ...); +void bch_check_keys(struct btree *, const char *, ...); + +#define bch_check_key_order(b, i) \ + bch_check_key_order_msg(b, i, "keys out of order") +#define EBUG_ON(cond) BUG_ON(cond) + +#else /* EDEBUG */ + +#define bch_count_data(b) 0 +#define bch_check_key_order(b, i) do {} while (0) +#define bch_check_key_order_msg(b, i, ...) do {} while (0) +#define bch_check_keys(b, ...) 
do {} while (0) +#define EBUG_ON(cond) do {} while (0) + +#endif + +#ifdef CONFIG_BCACHE_DEBUG + +void bch_btree_verify(struct btree *, struct bset *); +void bch_data_verify(struct search *); + +#else /* DEBUG */ + +static inline void bch_btree_verify(struct btree *b, struct bset *i) {} +static inline void bch_data_verify(struct search *s) {}; + +#endif + +#ifdef CONFIG_DEBUG_FS +void bch_debug_init_cache_set(struct cache_set *); +#else +static inline void bch_debug_init_cache_set(struct cache_set *c) {} +#endif + +#endif diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c new file mode 100644 index 000000000000..f565512f6fac --- /dev/null +++ b/drivers/md/bcache/io.c @@ -0,0 +1,390 @@ +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "bset.h" +#include "debug.h" + +static void bch_bi_idx_hack_endio(struct bio *bio, int error) +{ + struct bio *p = bio->bi_private; + + bio_endio(p, error); + bio_put(bio); +} + +static void bch_generic_make_request_hack(struct bio *bio) +{ + if (bio->bi_idx) { + struct bio *clone = bio_alloc(GFP_NOIO, bio_segments(bio)); + + memcpy(clone->bi_io_vec, + bio_iovec(bio), + bio_segments(bio) * sizeof(struct bio_vec)); + + clone->bi_sector = bio->bi_sector; + clone->bi_bdev = bio->bi_bdev; + clone->bi_rw = bio->bi_rw; + clone->bi_vcnt = bio_segments(bio); + clone->bi_size = bio->bi_size; + + clone->bi_private = bio; + clone->bi_end_io = bch_bi_idx_hack_endio; + + bio = clone; + } + + generic_make_request(bio); +} + +/** + * bch_bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * If bio_sectors(@bio) was less than or equal to @sectors, returns @bio + * unchanged. + * + * The newly allocated bio will point to @bio's bi_io_vec, if the split was on a + * bvec boundry; it is the caller's responsibility to ensure that @bio is not + * freed before the split. + * + * If bch_bio_split() is running under generic_make_request(), it's not safe to + * allocate more than one bio from the same bio set. Therefore, if it is running + * under generic_make_request() it masks out __GFP_WAIT when doing the + * allocation. The caller must check for failure if there's any possibility of + * it being called from under generic_make_request(); it is then the caller's + * responsibility to retry from a safe context (by e.g. punting to workqueue). + */ +struct bio *bch_bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + unsigned idx = bio->bi_idx, vcnt = 0, nbytes = sectors << 9; + struct bio_vec *bv; + struct bio *ret = NULL; + + BUG_ON(sectors <= 0); + + /* + * If we're being called from underneath generic_make_request() and we + * already allocated any bios from this bio set, we risk deadlock if we + * use the mempool. So instead, we possibly fail and let the caller punt + * to workqueue or somesuch and retry in a safe context. 
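+	 *
+	 * A caller in that situation ends up doing roughly what
+	 * __bch_bio_submit_split() below does (retry_fn standing in for
+	 * whatever closure function reissues the split):
+	 *
+	 *	n = bch_bio_split(bio, sectors, GFP_NOIO, bs);
+	 *	if (!n)
+	 *		continue_at(cl, retry_fn, system_wq);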
+ */ + if (current->bio_list) + gfp &= ~__GFP_WAIT; + + if (sectors >= bio_sectors(bio)) + return bio; + + if (bio->bi_rw & REQ_DISCARD) { + ret = bio_alloc_bioset(gfp, 1, bs); + idx = 0; + goto out; + } + + bio_for_each_segment(bv, bio, idx) { + vcnt = idx - bio->bi_idx; + + if (!nbytes) { + ret = bio_alloc_bioset(gfp, vcnt, bs); + if (!ret) + return NULL; + + memcpy(ret->bi_io_vec, bio_iovec(bio), + sizeof(struct bio_vec) * vcnt); + + break; + } else if (nbytes < bv->bv_len) { + ret = bio_alloc_bioset(gfp, ++vcnt, bs); + if (!ret) + return NULL; + + memcpy(ret->bi_io_vec, bio_iovec(bio), + sizeof(struct bio_vec) * vcnt); + + ret->bi_io_vec[vcnt - 1].bv_len = nbytes; + bv->bv_offset += nbytes; + bv->bv_len -= nbytes; + break; + } + + nbytes -= bv->bv_len; + } +out: + ret->bi_bdev = bio->bi_bdev; + ret->bi_sector = bio->bi_sector; + ret->bi_size = sectors << 9; + ret->bi_rw = bio->bi_rw; + ret->bi_vcnt = vcnt; + ret->bi_max_vecs = vcnt; + + bio->bi_sector += sectors; + bio->bi_size -= sectors << 9; + bio->bi_idx = idx; + + if (bio_integrity(bio)) { + if (bio_integrity_clone(ret, bio, gfp)) { + bio_put(ret); + return NULL; + } + + bio_integrity_trim(ret, 0, bio_sectors(ret)); + bio_integrity_trim(bio, bio_sectors(ret), bio_sectors(bio)); + } + + return ret; +} + +static unsigned bch_bio_max_sectors(struct bio *bio) +{ + unsigned ret = bio_sectors(bio); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct bio_vec *bv, *end = bio_iovec(bio) + + min_t(int, bio_segments(bio), queue_max_segments(q)); + + struct bvec_merge_data bvm = { + .bi_bdev = bio->bi_bdev, + .bi_sector = bio->bi_sector, + .bi_size = 0, + .bi_rw = bio->bi_rw, + }; + + if (bio->bi_rw & REQ_DISCARD) + return min(ret, q->limits.max_discard_sectors); + + if (bio_segments(bio) > queue_max_segments(q) || + q->merge_bvec_fn) { + ret = 0; + + for (bv = bio_iovec(bio); bv < end; bv++) { + if (q->merge_bvec_fn && + q->merge_bvec_fn(q, &bvm, bv) < (int) bv->bv_len) + break; + + ret += bv->bv_len >> 9; + bvm.bi_size += bv->bv_len; + } + + if (ret >= (BIO_MAX_PAGES * PAGE_SIZE) >> 9) + return (BIO_MAX_PAGES * PAGE_SIZE) >> 9; + } + + ret = min(ret, queue_max_sectors(q)); + + WARN_ON(!ret); + ret = max_t(int, ret, bio_iovec(bio)->bv_len >> 9); + + return ret; +} + +static void bch_bio_submit_split_done(struct closure *cl) +{ + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + + s->bio->bi_end_io = s->bi_end_io; + s->bio->bi_private = s->bi_private; + bio_endio(s->bio, 0); + + closure_debug_destroy(&s->cl); + mempool_free(s, s->p->bio_split_hook); +} + +static void bch_bio_submit_split_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + + if (error) + clear_bit(BIO_UPTODATE, &s->bio->bi_flags); + + bio_put(bio); + closure_put(cl); +} + +static void __bch_bio_submit_split(struct closure *cl) +{ + struct bio_split_hook *s = container_of(cl, struct bio_split_hook, cl); + struct bio *bio = s->bio, *n; + + do { + n = bch_bio_split(bio, bch_bio_max_sectors(bio), + GFP_NOIO, s->p->bio_split); + if (!n) + continue_at(cl, __bch_bio_submit_split, system_wq); + + n->bi_end_io = bch_bio_submit_split_endio; + n->bi_private = cl; + + closure_get(cl); + bch_generic_make_request_hack(n); + } while (n != bio); + + continue_at(cl, bch_bio_submit_split_done, NULL); +} + +void bch_generic_make_request(struct bio *bio, struct bio_split_pool *p) +{ + struct bio_split_hook *s; + + if (!bio_has_data(bio) && !(bio->bi_rw & 
REQ_DISCARD)) + goto submit; + + if (bio_sectors(bio) <= bch_bio_max_sectors(bio)) + goto submit; + + s = mempool_alloc(p->bio_split_hook, GFP_NOIO); + + s->bio = bio; + s->p = p; + s->bi_end_io = bio->bi_end_io; + s->bi_private = bio->bi_private; + bio_get(bio); + + closure_call(&s->cl, __bch_bio_submit_split, NULL, NULL); + return; +submit: + bch_generic_make_request_hack(bio); +} + +/* Bios with headers */ + +void bch_bbio_free(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + mempool_free(b, c->bio_meta); +} + +struct bio *bch_bbio_alloc(struct cache_set *c) +{ + struct bbio *b = mempool_alloc(c->bio_meta, GFP_NOIO); + struct bio *bio = &b->bio; + + bio_init(bio); + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = bucket_pages(c); + bio->bi_io_vec = bio->bi_inline_vecs; + + return bio; +} + +void __bch_submit_bbio(struct bio *bio, struct cache_set *c) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + + bio->bi_sector = PTR_OFFSET(&b->key, 0); + bio->bi_bdev = PTR_CACHE(c, &b->key, 0)->bdev; + + b->submit_time_us = local_clock_us(); + closure_bio_submit(bio, bio->bi_private, PTR_CACHE(c, &b->key, 0)); +} + +void bch_submit_bbio(struct bio *bio, struct cache_set *c, + struct bkey *k, unsigned ptr) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + bch_bkey_copy_single_ptr(&b->key, k, ptr); + __bch_submit_bbio(bio, c); +} + +/* IO errors */ + +void bch_count_io_errors(struct cache *ca, int error, const char *m) +{ + /* + * The halflife of an error is: + * log2(1/2)/log2(127/128) * refresh ~= 88 * refresh + */ + + if (ca->set->error_decay) { + unsigned count = atomic_inc_return(&ca->io_count); + + while (count > ca->set->error_decay) { + unsigned errors; + unsigned old = count; + unsigned new = count - ca->set->error_decay; + + /* + * First we subtract refresh from count; each time we + * succesfully do so, we rescale the errors once: + */ + + count = atomic_cmpxchg(&ca->io_count, old, new); + + if (count == old) { + count = new; + + errors = atomic_read(&ca->io_errors); + do { + old = errors; + new = ((uint64_t) errors * 127) / 128; + errors = atomic_cmpxchg(&ca->io_errors, + old, new); + } while (old != errors); + } + } + } + + if (error) { + char buf[BDEVNAME_SIZE]; + unsigned errors = atomic_add_return(1 << IO_ERROR_SHIFT, + &ca->io_errors); + errors >>= IO_ERROR_SHIFT; + + if (errors < ca->set->error_limit) + pr_err("%s: IO error on %s, recovering", + bdevname(ca->bdev, buf), m); + else + bch_cache_set_error(ca->set, + "%s: too many IO errors %s", + bdevname(ca->bdev, buf), m); + } +} + +void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio, + int error, const char *m) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct cache *ca = PTR_CACHE(c, &b->key, 0); + + unsigned threshold = bio->bi_rw & REQ_WRITE + ? 
c->congested_write_threshold_us + : c->congested_read_threshold_us; + + if (threshold) { + unsigned t = local_clock_us(); + + int us = t - b->submit_time_us; + int congested = atomic_read(&c->congested); + + if (us > (int) threshold) { + int ms = us / 1024; + c->congested_last_us = t; + + ms = min(ms, CONGESTED_MAX + congested); + atomic_sub(ms, &c->congested); + } else if (congested < 0) + atomic_inc(&c->congested); + } + + bch_count_io_errors(ca, error, m); +} + +void bch_bbio_endio(struct cache_set *c, struct bio *bio, + int error, const char *m) +{ + struct closure *cl = bio->bi_private; + + bch_bbio_count_io_errors(c, bio, error, m); + bio_put(bio); + closure_put(cl); +} diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c new file mode 100644 index 000000000000..c871ffaabbb0 --- /dev/null +++ b/drivers/md/bcache/journal.c @@ -0,0 +1,785 @@ +/* + * bcache journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +/* + * Journal replay/recovery: + * + * This code is all driven from run_cache_set(); we first read the journal + * entries, do some other stuff, then we mark all the keys in the journal + * entries (same as garbage collection would), then we replay them - reinserting + * them into the cache in precisely the same order as they appear in the + * journal. + * + * We only journal keys that go in leaf nodes, which simplifies things quite a + * bit. + */ + +static void journal_read_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + closure_put(cl); +} + +static int journal_read_bucket(struct cache *ca, struct list_head *list, + struct btree_op *op, unsigned bucket_index) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->bio; + + struct journal_replay *i; + struct jset *j, *data = ca->set->journal.w[0].data; + unsigned len, left, offset = 0; + int ret = 0; + sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]); + + pr_debug("reading %llu", (uint64_t) bucket); + + while (offset < ca->sb.bucket_size) { +reread: left = ca->sb.bucket_size - offset; + len = min_t(unsigned, left, PAGE_SECTORS * 8); + + bio_reset(bio); + bio->bi_sector = bucket + offset; + bio->bi_bdev = ca->bdev; + bio->bi_rw = READ; + bio->bi_size = len << 9; + + bio->bi_end_io = journal_read_endio; + bio->bi_private = &op->cl; + bio_map(bio, data); + + closure_bio_submit(bio, &op->cl, ca); + closure_sync(&op->cl); + + /* This function could be simpler now since we no longer write + * journal entries that overlap bucket boundaries; this means + * the start of a bucket will always have a valid journal entry + * if it has any journal entries at all. 
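+		 *
+		 * Note the reread: label above: if an entry turns out to
+		 * straddle the end of the chunk we just read (the
+		 * bytes > len << 9 check below), we reissue the read starting
+		 * at that entry's offset, so the whole entry lands in the
+		 * buffer before we try to parse it.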
+ */ + + j = data; + while (len) { + struct list_head *where; + size_t blocks, bytes = set_bytes(j); + + if (j->magic != jset_magic(ca->set)) + return ret; + + if (bytes > left << 9) + return ret; + + if (bytes > len << 9) + goto reread; + + if (j->csum != csum_set(j)) + return ret; + + blocks = set_blocks(j, ca->set); + + while (!list_empty(list)) { + i = list_first_entry(list, + struct journal_replay, list); + if (i->j.seq >= j->last_seq) + break; + list_del(&i->list); + kfree(i); + } + + list_for_each_entry_reverse(i, list, list) { + if (j->seq == i->j.seq) + goto next_set; + + if (j->seq < i->j.last_seq) + goto next_set; + + if (j->seq > i->j.seq) { + where = &i->list; + goto add; + } + } + + where = list; +add: + i = kmalloc(offsetof(struct journal_replay, j) + + bytes, GFP_KERNEL); + if (!i) + return -ENOMEM; + memcpy(&i->j, j, bytes); + list_add(&i->list, where); + ret = 1; + + ja->seq[bucket_index] = j->seq; +next_set: + offset += blocks * ca->sb.block_size; + len -= blocks * ca->sb.block_size; + j = ((void *) j) + blocks * block_bytes(ca); + } + } + + return ret; +} + +int bch_journal_read(struct cache_set *c, struct list_head *list, + struct btree_op *op) +{ +#define read_bucket(b) \ + ({ \ + int ret = journal_read_bucket(ca, list, op, b); \ + __set_bit(b, bitmap); \ + if (ret < 0) \ + return ret; \ + ret; \ + }) + + struct cache *ca; + unsigned iter; + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG]; + unsigned i, l, r, m; + uint64_t seq; + + bitmap_zero(bitmap, SB_JOURNAL_BUCKETS); + pr_debug("%u journal buckets", ca->sb.njournal_buckets); + + /* Read journal buckets ordered by golden ratio hash to quickly + * find a sequence of buckets with valid journal entries + */ + for (i = 0; i < ca->sb.njournal_buckets; i++) { + l = (i * 2654435769U) % ca->sb.njournal_buckets; + + if (test_bit(l, bitmap)) + break; + + if (read_bucket(l)) + goto bsearch; + } + + /* If that fails, check all the buckets we haven't checked + * already + */ + pr_debug("falling back to linear search"); + + for (l = 0; l < ca->sb.njournal_buckets; l++) { + if (test_bit(l, bitmap)) + continue; + + if (read_bucket(l)) + goto bsearch; + } +bsearch: + /* Binary search */ + m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1); + pr_debug("starting binary search, l %u r %u", l, r); + + while (l + 1 < r) { + m = (l + r) >> 1; + + if (read_bucket(m)) + l = m; + else + r = m; + } + + /* Read buckets in reverse order until we stop finding more + * journal entries + */ + pr_debug("finishing up"); + l = m; + + while (1) { + if (!l--) + l = ca->sb.njournal_buckets - 1; + + if (l == m) + break; + + if (test_bit(l, bitmap)) + continue; + + if (!read_bucket(l)) + break; + } + + seq = 0; + + for (i = 0; i < ca->sb.njournal_buckets; i++) + if (ja->seq[i] > seq) { + seq = ja->seq[i]; + ja->cur_idx = ja->discard_idx = + ja->last_idx = i; + + } + } + + c->journal.seq = list_entry(list->prev, + struct journal_replay, + list)->j.seq; + + return 0; +#undef read_bucket +} + +void bch_journal_mark(struct cache_set *c, struct list_head *list) +{ + atomic_t p = { 0 }; + struct bkey *k; + struct journal_replay *i; + struct journal *j = &c->journal; + uint64_t last = j->seq; + + /* + * journal.pin should never fill up - we never write a journal + * entry when it would fill up. But if for some reason it does, we + * iterate over the list in reverse order so that we can just skip that + * refcount instead of bugging. 
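+	 *
+	 * Walking the list newest to oldest also means the pins are pushed
+	 * onto the front of the fifo in decreasing sequence order, so when
+	 * we're done the front of the fifo is the oldest open journal
+	 * entry - which is the invariant last_seq() relies on.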
+ */ + + list_for_each_entry_reverse(i, list, list) { + BUG_ON(last < i->j.seq); + i->pin = NULL; + + while (last-- != i->j.seq) + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + atomic_set(&fifo_front(&j->pin), 0); + } + + if (fifo_free(&j->pin) > 1) { + fifo_push_front(&j->pin, p); + i->pin = &fifo_front(&j->pin); + atomic_set(i->pin, 1); + } + + for (k = i->j.start; + k < end(&i->j); + k = bkey_next(k)) { + unsigned j; + + for (j = 0; j < KEY_PTRS(k); j++) { + struct bucket *g = PTR_BUCKET(c, k, j); + atomic_inc(&g->pin); + + if (g->prio == BTREE_PRIO && + !ptr_stale(c, k, j)) + g->prio = INITIAL_PRIO; + } + + __bch_btree_mark_key(c, 0, k); + } + } +} + +int bch_journal_replay(struct cache_set *s, struct list_head *list, + struct btree_op *op) +{ + int ret = 0, keys = 0, entries = 0; + struct bkey *k; + struct journal_replay *i = + list_entry(list->prev, struct journal_replay, list); + + uint64_t start = i->j.last_seq, end = i->j.seq, n = start; + + list_for_each_entry(i, list, list) { + BUG_ON(i->pin && atomic_read(i->pin) != 1); + + if (n != i->j.seq) + pr_err("journal entries %llu-%llu " + "missing! (replaying %llu-%llu)\n", + n, i->j.seq - 1, start, end); + + for (k = i->j.start; + k < end(&i->j); + k = bkey_next(k)) { + pr_debug("%s", pkey(k)); + bkey_copy(op->keys.top, k); + bch_keylist_push(&op->keys); + + op->journal = i->pin; + atomic_inc(op->journal); + + ret = bch_btree_insert(op, s); + if (ret) + goto err; + + BUG_ON(!bch_keylist_empty(&op->keys)); + keys++; + + cond_resched(); + } + + if (i->pin) + atomic_dec(i->pin); + n = i->j.seq + 1; + entries++; + } + + pr_info("journal replay done, %i keys in %i entries, seq %llu", + keys, entries, end); + + while (!list_empty(list)) { + i = list_first_entry(list, struct journal_replay, list); + list_del(&i->list); + kfree(i); + } +err: + closure_sync(&op->cl); + return ret; +} + +/* Journalling */ + +static void btree_flush_write(struct cache_set *c) +{ + /* + * Try to find the btree node with that references the oldest journal + * entry, best is our current candidate and is locked if non NULL: + */ + struct btree *b, *best = NULL; + unsigned iter; + + for_each_cached_btree(b, c, iter) { + if (!down_write_trylock(&b->lock)) + continue; + + if (!btree_node_dirty(b) || + !btree_current_write(b)->journal) { + rw_unlock(true, b); + continue; + } + + if (!best) + best = b; + else if (journal_pin_cmp(c, + btree_current_write(best), + btree_current_write(b))) { + rw_unlock(true, best); + best = b; + } else + rw_unlock(true, b); + } + + if (best) + goto out; + + /* We can't find the best btree node, just pick the first */ + list_for_each_entry(b, &c->btree_cache, list) + if (!b->level && btree_node_dirty(b)) { + best = b; + rw_lock(true, best, best->level); + goto found; + } + +out: + if (!best) + return; +found: + if (btree_node_dirty(best)) + bch_btree_write(best, true, NULL); + rw_unlock(true, best); +} + +#define last_seq(j) ((j)->seq - fifo_used(&(j)->pin) + 1) + +static void journal_discard_endio(struct bio *bio, int error) +{ + struct journal_device *ja = + container_of(bio, struct journal_device, discard_bio); + struct cache *ca = container_of(ja, struct cache, journal); + + atomic_set(&ja->discard_in_flight, DISCARD_DONE); + + closure_wake_up(&ca->set->journal.wait); + closure_put(&ca->set->cl); +} + +static void journal_discard_work(struct work_struct *work) +{ + struct journal_device *ja = + container_of(work, struct journal_device, discard_work); + + submit_bio(0, &ja->discard_bio); +} + +static void 
do_journal_discard(struct cache *ca) +{ + struct journal_device *ja = &ca->journal; + struct bio *bio = &ja->discard_bio; + + if (!ca->discard) { + ja->discard_idx = ja->last_idx; + return; + } + + switch (atomic_read(&ja->discard_in_flight) == DISCARD_IN_FLIGHT) { + case DISCARD_IN_FLIGHT: + return; + + case DISCARD_DONE: + ja->discard_idx = (ja->discard_idx + 1) % + ca->sb.njournal_buckets; + + atomic_set(&ja->discard_in_flight, DISCARD_READY); + /* fallthrough */ + + case DISCARD_READY: + if (ja->discard_idx == ja->last_idx) + return; + + atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT); + + bio_init(bio); + bio->bi_sector = bucket_to_sector(ca->set, + ca->sb.d[ja->discard_idx]); + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_WRITE|REQ_DISCARD; + bio->bi_max_vecs = 1; + bio->bi_io_vec = bio->bi_inline_vecs; + bio->bi_size = bucket_bytes(ca); + bio->bi_end_io = journal_discard_endio; + + closure_get(&ca->set->cl); + INIT_WORK(&ja->discard_work, journal_discard_work); + schedule_work(&ja->discard_work); + } +} + +static void journal_reclaim(struct cache_set *c) +{ + struct bkey *k = &c->journal.key; + struct cache *ca; + uint64_t last_seq; + unsigned iter, n = 0; + atomic_t p; + + while (!atomic_read(&fifo_front(&c->journal.pin))) + fifo_pop(&c->journal.pin, p); + + last_seq = last_seq(&c->journal); + + /* Update last_idx */ + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (ja->last_idx != ja->cur_idx && + ja->seq[ja->last_idx] < last_seq) + ja->last_idx = (ja->last_idx + 1) % + ca->sb.njournal_buckets; + } + + for_each_cache(ca, c, iter) + do_journal_discard(ca); + + if (c->journal.blocks_free) + return; + + /* + * Allocate: + * XXX: Sort by free journal space + */ + + for_each_cache(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned next = (ja->cur_idx + 1) % ca->sb.njournal_buckets; + + /* No space available on this device */ + if (next == ja->discard_idx) + continue; + + ja->cur_idx = next; + k->ptr[n++] = PTR(0, + bucket_to_sector(c, ca->sb.d[ja->cur_idx]), + ca->sb.nr_this_dev); + } + + bkey_init(k); + SET_KEY_PTRS(k, n); + + if (n) + c->journal.blocks_free = c->sb.bucket_size >> c->block_bits; + + if (!journal_full(&c->journal)) + __closure_wake_up(&c->journal.wait); +} + +void bch_journal_next(struct journal *j) +{ + atomic_t p = { 1 }; + + j->cur = (j->cur == j->w) + ? &j->w[1] + : &j->w[0]; + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for last_seq() to be calculated correctly + */ + BUG_ON(!fifo_push(&j->pin, p)); + atomic_set(&fifo_back(&j->pin), 1); + + j->cur->data->seq = ++j->seq; + j->cur->need_write = false; + j->cur->data->keys = 0; + + if (fifo_full(&j->pin)) + pr_debug("journal_pin full (%zu)", fifo_used(&j->pin)); +} + +static void journal_write_endio(struct bio *bio, int error) +{ + struct journal_write *w = bio->bi_private; + + cache_set_err_on(error, w->c, "journal io error"); + closure_put(&w->c->journal.io.cl); +} + +static void journal_write(struct closure *); + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io.cl); + struct cache_set *c = container_of(j, struct cache_set, journal); + + struct journal_write *w = (j->cur == j->w) + ? 
&j->w[1] + : &j->w[0]; + + __closure_wake_up(&w->wait); + + if (c->journal_delay_ms) + closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms)); + + continue_at(cl, journal_write, system_wq); +} + +static void journal_write_unlocked(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + struct cache *ca; + struct journal_write *w = c->journal.cur; + struct bkey *k = &c->journal.key; + unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size; + + struct bio *bio; + struct bio_list list; + bio_list_init(&list); + + if (!w->need_write) { + /* + * XXX: have to unlock closure before we unlock journal lock, + * else we race with bch_journal(). But this way we race + * against cache set unregister. Doh. + */ + set_closure_fn(cl, NULL, NULL); + closure_sub(cl, CLOSURE_RUNNING + 1); + spin_unlock(&c->journal.lock); + return; + } else if (journal_full(&c->journal)) { + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + continue_at(cl, journal_write, system_wq); + } + + c->journal.blocks_free -= set_blocks(w->data, c); + + w->data->btree_level = c->root->level; + + bkey_copy(&w->data->btree_root, &c->root->key); + bkey_copy(&w->data->uuid_bucket, &c->uuid_bucket); + + for_each_cache(ca, c, i) + w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0]; + + w->data->magic = jset_magic(c); + w->data->version = BCACHE_JSET_VERSION; + w->data->last_seq = last_seq(&c->journal); + w->data->csum = csum_set(w->data); + + for (i = 0; i < KEY_PTRS(k); i++) { + ca = PTR_CACHE(c, k, i); + bio = &ca->journal.bio; + + atomic_long_add(sectors, &ca->meta_sectors_written); + + bio_reset(bio); + bio->bi_sector = PTR_OFFSET(k, i); + bio->bi_bdev = ca->bdev; + bio->bi_rw = REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH; + bio->bi_size = sectors << 9; + + bio->bi_end_io = journal_write_endio; + bio->bi_private = w; + bio_map(bio, w->data); + + trace_bcache_journal_write(bio); + bio_list_add(&list, bio); + + SET_PTR_OFFSET(k, i, PTR_OFFSET(k, i) + sectors); + + ca->journal.seq[ca->journal.cur_idx] = w->data->seq; + } + + atomic_dec_bug(&fifo_back(&c->journal.pin)); + bch_journal_next(&c->journal); + journal_reclaim(c); + + spin_unlock(&c->journal.lock); + + while ((bio = bio_list_pop(&list))) + closure_bio_submit(bio, cl, c->cache[0]); + + continue_at(cl, journal_write_done, NULL); +} + +static void journal_write(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl); + + spin_lock(&c->journal.lock); + journal_write_unlocked(cl); +} + +static void __journal_try_write(struct cache_set *c, bool noflush) +{ + struct closure *cl = &c->journal.io.cl; + + if (!closure_trylock(cl, &c->cl)) + spin_unlock(&c->journal.lock); + else if (noflush && journal_full(&c->journal)) { + spin_unlock(&c->journal.lock); + continue_at(cl, journal_write, system_wq); + } else + journal_write_unlocked(cl); +} + +#define journal_try_write(c) __journal_try_write(c, false) + +void bch_journal_meta(struct cache_set *c, struct closure *cl) +{ + struct journal_write *w; + + if (CACHE_SYNC(&c->sb)) { + spin_lock(&c->journal.lock); + + w = c->journal.cur; + w->need_write = true; + + if (cl) + BUG_ON(!closure_wait(&w->wait, cl)); + + __journal_try_write(c, true); + } +} + +/* + * Entry point to the journalling code - bio_insert() and btree_invalidate() + * pass bch_journal() a list of keys to be journalled, and then + * bch_journal() hands those same keys off to btree_insert_async() + */ + +void bch_journal(struct closure *cl) +{ + 
struct btree_op *op = container_of(cl, struct btree_op, cl); + struct cache_set *c = op->c; + struct journal_write *w; + size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list; + + if (op->type != BTREE_INSERT || + !CACHE_SYNC(&c->sb)) + goto out; + + /* + * If we're looping because we errored, might already be waiting on + * another journal write: + */ + while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING) + closure_sync(cl->parent); + + spin_lock(&c->journal.lock); + + if (journal_full(&c->journal)) { + /* XXX: tracepoint */ + closure_wait(&c->journal.wait, cl); + + journal_reclaim(c); + spin_unlock(&c->journal.lock); + + btree_flush_write(c); + continue_at(cl, bch_journal, bcache_wq); + } + + w = c->journal.cur; + w->need_write = true; + b = __set_blocks(w->data, w->data->keys + n, c); + + if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS || + b > c->journal.blocks_free) { + /* XXX: If we were inserting so many keys that they won't fit in + * an _empty_ journal write, we'll deadlock. For now, handle + * this in bch_keylist_realloc() - but something to think about. + */ + BUG_ON(!w->data->keys); + + /* XXX: tracepoint */ + BUG_ON(!closure_wait(&w->wait, cl)); + + closure_flush(&c->journal.io); + + journal_try_write(c); + continue_at(cl, bch_journal, bcache_wq); + } + + memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t)); + w->data->keys += n; + + op->journal = &fifo_back(&c->journal.pin); + atomic_inc(op->journal); + + if (op->flush_journal) { + closure_flush(&c->journal.io); + closure_wait(&w->wait, cl->parent); + } + + journal_try_write(c); +out: + bch_btree_insert_async(cl); +} + +void bch_journal_free(struct cache_set *c) +{ + free_pages((unsigned long) c->journal.w[1].data, JSET_BITS); + free_pages((unsigned long) c->journal.w[0].data, JSET_BITS); + free_fifo(&c->journal.pin); +} + +int bch_journal_alloc(struct cache_set *c) +{ + struct journal *j = &c->journal; + + closure_init_unlocked(&j->io); + spin_lock_init(&j->lock); + + c->journal_delay_ms = 100; + + j->w[0].c = c; + j->w[1].c = c; + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || + !(j->w[0].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS)) || + !(j->w[1].data = (void *) __get_free_pages(GFP_KERNEL, JSET_BITS))) + return -ENOMEM; + + return 0; +} diff --git a/drivers/md/bcache/journal.h b/drivers/md/bcache/journal.h new file mode 100644 index 000000000000..3d7851274b04 --- /dev/null +++ b/drivers/md/bcache/journal.h @@ -0,0 +1,215 @@ +#ifndef _BCACHE_JOURNAL_H +#define _BCACHE_JOURNAL_H + +/* + * THE JOURNAL: + * + * The journal is treated as a circular buffer of buckets - a journal entry + * never spans two buckets. This means (not implemented yet) we can resize the + * journal at runtime, and will be needed for bcache on raw flash support. + * + * Journal entries contain a list of keys, ordered by the time they were + * inserted; thus journal replay just has to reinsert the keys. + * + * We also keep some things in the journal header that are logically part of the + * superblock - all the things that are frequently updated. This is for future + * bcache on raw flash support; the superblock (which will become another + * journal) can't be moved or wear leveled, so it contains just enough + * information to find the main journal, and the superblock only has to be + * rewritten when we want to move/wear level the main journal. + * + * Currently, we don't journal BTREE_REPLACE operations - this will hopefully be + * fixed eventually. 
This isn't a bug - BTREE_REPLACE is used for insertions + * from cache misses, which don't have to be journaled, and for writeback and + * moving gc we work around it by flushing the btree to disk before updating the + * gc information. But it is a potential issue with incremental garbage + * collection, and it's fragile. + * + * OPEN JOURNAL ENTRIES: + * + * Each journal entry contains, in the header, the sequence number of the last + * journal entry still open - i.e. that has keys that haven't been flushed to + * disk in the btree. + * + * We track this by maintaining a refcount for every open journal entry, in a + * fifo; each entry in the fifo corresponds to a particular journal + * entry/sequence number. When the refcount at the tail of the fifo goes to + * zero, we pop it off - thus, the size of the fifo tells us the number of open + * journal entries + * + * We take a refcount on a journal entry when we add some keys to a journal + * entry that we're going to insert (held by struct btree_op), and then when we + * insert those keys into the btree the btree write we're setting up takes a + * copy of that refcount (held by struct btree_write). That refcount is dropped + * when the btree write completes. + * + * A struct btree_write can only hold a refcount on a single journal entry, but + * might contain keys for many journal entries - we handle this by making sure + * it always has a refcount on the _oldest_ journal entry of all the journal + * entries it has keys for. + * + * JOURNAL RECLAIM: + * + * As mentioned previously, our fifo of refcounts tells us the number of open + * journal entries; from that and the current journal sequence number we compute + * last_seq - the oldest journal entry we still need. We write last_seq in each + * journal entry, and we also have to keep track of where it exists on disk so + * we don't overwrite it when we loop around the journal. + * + * To do that we track, for each journal bucket, the sequence number of the + * newest journal entry it contains - if we don't need that journal entry we + * don't need anything in that bucket anymore. From that we track the last + * journal bucket we still need; all this is tracked in struct journal_device + * and updated by journal_reclaim(). + * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +#define BCACHE_JSET_VERSION_UUIDv1 1 +/* Always latest UUID format */ +#define BCACHE_JSET_VERSION_UUID 1 +#define BCACHE_JSET_VERSION 1 + +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. 
+ */ +struct jset { + uint64_t csum; + uint64_t magic; + uint64_t seq; + uint32_t version; + uint32_t keys; + + uint64_t last_seq; + + BKEY_PADDED(uuid_bucket); + BKEY_PADDED(btree_root); + uint16_t btree_level; + uint16_t pad[3]; + + uint64_t prio_bucket[MAX_CACHES_PER_SET]; + + union { + struct bkey start[0]; + uint64_t d[0]; + }; +}; + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct list_head list; + atomic_t *pin; + struct jset j; +}; + +/* + * We put two of these in struct journal; we used them for writes to the + * journal that are being staged or in flight. + */ +struct journal_write { + struct jset *data; +#define JSET_BITS 3 + + struct cache_set *c; + struct closure_waitlist wait; + bool need_write; +}; + +/* Embedded in struct cache_set */ +struct journal { + spinlock_t lock; + /* used when waiting because the journal was full */ + struct closure_waitlist wait; + struct closure_with_timer io; + + /* Number of blocks free in the bucket(s) we're currently writing to */ + unsigned blocks_free; + uint64_t seq; + DECLARE_FIFO(atomic_t, pin); + + BKEY_PADDED(key); + + struct journal_write w[2], *cur; +}; + +/* + * Embedded in struct cache. First three fields refer to the array of journal + * buckets, in cache_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. + */ + uint64_t seq[SB_JOURNAL_BUCKETS]; + + /* Journal bucket we're currently writing to */ + unsigned cur_idx; + + /* Last journal bucket that still contains an open journal entry */ + unsigned last_idx; + + /* Next journal bucket to be discarded */ + unsigned discard_idx; + +#define DISCARD_READY 0 +#define DISCARD_IN_FLIGHT 1 +#define DISCARD_DONE 2 + /* 1 - discard in flight, -1 - discard completed */ + atomic_t discard_in_flight; + + struct work_struct discard_work; + struct bio discard_bio; + struct bio_vec discard_bv; + + /* Bio for journal reads/writes to this device */ + struct bio bio; + struct bio_vec bv[8]; +}; + +#define journal_pin_cmp(c, l, r) \ + (fifo_idx(&(c)->journal.pin, (l)->journal) > \ + fifo_idx(&(c)->journal.pin, (r)->journal)) + +#define JOURNAL_PIN 20000 + +#define journal_full(j) \ + (!(j)->blocks_free || fifo_free(&(j)->pin) <= 1) + +struct closure; +struct cache_set; +struct btree_op; + +void bch_journal(struct closure *); +void bch_journal_next(struct journal *); +void bch_journal_mark(struct cache_set *, struct list_head *); +void bch_journal_meta(struct cache_set *, struct closure *); +int bch_journal_read(struct cache_set *, struct list_head *, + struct btree_op *); +int bch_journal_replay(struct cache_set *, struct list_head *, + struct btree_op *); + +void bch_journal_free(struct cache_set *); +int bch_journal_alloc(struct cache_set *); + +#endif /* _BCACHE_JOURNAL_H */ diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c new file mode 100644 index 000000000000..c69fc92b02cf --- /dev/null +++ b/drivers/md/bcache/movinggc.c @@ -0,0 +1,254 @@ +/* + * Moving/copying garbage collector + * + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +struct moving_io { + struct keybuf_key *w; + struct search s; + struct bbio bio; +}; + +static bool moving_pred(struct keybuf *buf, struct bkey *k) +{ + struct cache_set *c = container_of(buf, struct cache_set, + moving_gc_keys); + unsigned i; + + for (i = 0; i < KEY_PTRS(k); i++) { + struct cache *ca = PTR_CACHE(c, k, i); + struct bucket *g = PTR_BUCKET(c, k, i); + + if (GC_SECTORS_USED(g) < ca->gc_move_threshold) + return true; + } + + return false; +} + +/* Moving GC - IO loop */ + +static void moving_io_destructor(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, s.cl); + kfree(io); +} + +static void write_moving_finish(struct closure *cl) +{ + struct moving_io *io = container_of(cl, struct moving_io, s.cl); + struct bio *bio = &io->bio.bio; + struct bio_vec *bv = bio_iovec_idx(bio, bio->bi_vcnt); + + while (bv-- != bio->bi_io_vec) + __free_page(bv->bv_page); + + pr_debug("%s %s", io->s.op.insert_collision + ? "collision moving" : "moved", + pkey(&io->w->key)); + + bch_keybuf_del(&io->s.op.c->moving_gc_keys, io->w); + + atomic_dec_bug(&io->s.op.c->in_flight); + closure_wake_up(&io->s.op.c->moving_gc_wait); + + closure_return_with_destructor(cl, moving_io_destructor); +} + +static void read_moving_endio(struct bio *bio, int error) +{ + struct moving_io *io = container_of(bio->bi_private, + struct moving_io, s.cl); + + if (error) + io->s.error = error; + + bch_bbio_endio(io->s.op.c, bio, error, "reading data to move"); +} + +static void moving_init(struct moving_io *io) +{ + struct bio *bio = &io->bio.bio; + + bio_init(bio); + bio_get(bio); + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_size = KEY_SIZE(&io->w->key) << 9; + bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&io->w->key), + PAGE_SECTORS); + bio->bi_private = &io->s.cl; + bio->bi_io_vec = bio->bi_inline_vecs; + bio_map(bio, NULL); +} + +static void write_moving(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct moving_io *io = container_of(s, struct moving_io, s); + + if (!s->error) { + trace_bcache_write_moving(&io->bio.bio); + + moving_init(io); + + io->bio.bio.bi_sector = KEY_START(&io->w->key); + s->op.lock = -1; + s->op.write_prio = 1; + s->op.cache_bio = &io->bio.bio; + + s->writeback = KEY_DIRTY(&io->w->key); + s->op.csum = KEY_CSUM(&io->w->key); + + s->op.type = BTREE_REPLACE; + bkey_copy(&s->op.replace, &io->w->key); + + closure_init(&s->op.cl, cl); + bch_insert_data(&s->op.cl); + } + + continue_at(cl, write_moving_finish, NULL); +} + +static void read_moving_submit(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct moving_io *io = container_of(s, struct moving_io, s); + struct bio *bio = &io->bio.bio; + + trace_bcache_read_moving(bio); + bch_submit_bbio(bio, s->op.c, &io->w->key, 0); + + continue_at(cl, write_moving, bch_gc_wq); +} + +static void read_moving(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, moving_gc); + struct keybuf_key *w; + struct moving_io *io; + struct bio *bio; + + /* XXX: if we error, background writeback could stall indefinitely */ + + while (!test_bit(CACHE_SET_STOPPING, &c->flags)) { + w = bch_keybuf_next_rescan(c, &c->moving_gc_keys, &MAX_KEY); + if (!w) + break; + + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; 
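+		/* cross-link, so write_moving_finish() can find and drop w */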
+ io->w = w; + io->s.op.inode = KEY_INODE(&w->key); + io->s.op.c = c; + + moving_init(io); + bio = &io->bio.bio; + + bio->bi_rw = READ; + bio->bi_end_io = read_moving_endio; + + if (bio_alloc_pages(bio, GFP_KERNEL)) + goto err; + + pr_debug("%s", pkey(&w->key)); + + closure_call(&io->s.cl, read_moving_submit, NULL, &c->gc.cl); + + if (atomic_inc_return(&c->in_flight) >= 64) { + closure_wait_event(&c->moving_gc_wait, cl, + atomic_read(&c->in_flight) < 64); + continue_at(cl, read_moving, bch_gc_wq); + } + } + + if (0) { +err: if (!IS_ERR_OR_NULL(w->private)) + kfree(w->private); + + bch_keybuf_del(&c->moving_gc_keys, w); + } + + closure_return(cl); +} + +void bch_moving_gc(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, gc.cl); + struct cache *ca; + struct bucket *b; + unsigned i; + + bool bucket_cmp(struct bucket *l, struct bucket *r) + { + return GC_SECTORS_USED(l) < GC_SECTORS_USED(r); + } + + unsigned top(struct cache *ca) + { + return GC_SECTORS_USED(heap_peek(&ca->heap)); + } + + if (!c->copy_gc_enabled) + closure_return(cl); + + mutex_lock(&c->bucket_lock); + + for_each_cache(ca, c, i) { + unsigned sectors_to_move = 0; + unsigned reserve_sectors = ca->sb.bucket_size * + min(fifo_used(&ca->free), ca->free.size / 2); + + ca->heap.used = 0; + + for_each_bucket(b, ca) { + if (!GC_SECTORS_USED(b)) + continue; + + if (!heap_full(&ca->heap)) { + sectors_to_move += GC_SECTORS_USED(b); + heap_add(&ca->heap, b, bucket_cmp); + } else if (bucket_cmp(b, heap_peek(&ca->heap))) { + sectors_to_move -= top(ca); + sectors_to_move += GC_SECTORS_USED(b); + + ca->heap.data[0] = b; + heap_sift(&ca->heap, 0, bucket_cmp); + } + } + + while (sectors_to_move > reserve_sectors) { + heap_pop(&ca->heap, b, bucket_cmp); + sectors_to_move -= GC_SECTORS_USED(b); + } + + ca->gc_move_threshold = top(ca); + + pr_debug("threshold %u", ca->gc_move_threshold); + } + + mutex_unlock(&c->bucket_lock); + + c->moving_gc_keys.last_scanned = ZERO_KEY; + + closure_init(&c->moving_gc, cl); + read_moving(&c->moving_gc); + + closure_return(cl); +} + +void bch_moving_init_cache_set(struct cache_set *c) +{ + bch_keybuf_init(&c->moving_gc_keys, moving_pred); +} diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c new file mode 100644 index 000000000000..4f552de49aaa --- /dev/null +++ b/drivers/md/bcache/request.c @@ -0,0 +1,1409 @@ +/* + * Main bcache entry point - handle a read or a write request and decide what to + * do with it; the make_request functions are called by the block layer. + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include +#include +#include +#include +#include "blk-cgroup.h" + +#include + +#define CUTOFF_CACHE_ADD 95 +#define CUTOFF_CACHE_READA 90 +#define CUTOFF_WRITEBACK 50 +#define CUTOFF_WRITEBACK_SYNC 75 + +struct kmem_cache *bch_search_cache; + +static void check_should_skip(struct cached_dev *, struct search *); + +/* Cgroup interface */ + +#ifdef CONFIG_CGROUP_BCACHE +static struct bch_cgroup bcache_default_cgroup = { .cache_mode = -1 }; + +static struct bch_cgroup *cgroup_to_bcache(struct cgroup *cgroup) +{ + struct cgroup_subsys_state *css; + return cgroup && + (css = cgroup_subsys_state(cgroup, bcache_subsys_id)) + ? container_of(css, struct bch_cgroup, css) + : &bcache_default_cgroup; +} + +struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio) +{ + struct cgroup_subsys_state *css = bio->bi_css + ? 
cgroup_subsys_state(bio->bi_css->cgroup, bcache_subsys_id)
+ : task_subsys_state(current, bcache_subsys_id);
+
+ return css
+ ? container_of(css, struct bch_cgroup, css)
+ : &bcache_default_cgroup;
+}
+
+static ssize_t cache_mode_read(struct cgroup *cgrp, struct cftype *cft,
+ struct file *file,
+ char __user *buf, size_t nbytes, loff_t *ppos)
+{
+ char tmp[1024];
+ int len = snprint_string_list(tmp, sizeof(tmp), bch_cache_modes,
+ cgroup_to_bcache(cgrp)->cache_mode + 1);
+
+ if (len < 0)
+ return len;
+
+ return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+
+static int cache_mode_write(struct cgroup *cgrp, struct cftype *cft,
+ const char *buf)
+{
+ int v = read_string_list(buf, bch_cache_modes);
+ if (v < 0)
+ return v;
+
+ cgroup_to_bcache(cgrp)->cache_mode = v - 1;
+ return 0;
+}
+
+static u64 bch_verify_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ return cgroup_to_bcache(cgrp)->verify;
+}
+
+static int bch_verify_write(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+ cgroup_to_bcache(cgrp)->verify = val;
+ return 0;
+}
+
+static u64 bch_cache_hits_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_hits);
+}
+
+static u64 bch_cache_misses_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_misses);
+}
+
+static u64 bch_cache_bypass_hits_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_bypass_hits);
+}
+
+static u64 bch_cache_bypass_misses_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ struct bch_cgroup *bcachecg = cgroup_to_bcache(cgrp);
+ return atomic_read(&bcachecg->stats.cache_bypass_misses);
+}
+
+static struct cftype bch_files[] = {
+ {
+ .name = "cache_mode",
+ .read = cache_mode_read,
+ .write_string = cache_mode_write,
+ },
+ {
+ .name = "verify",
+ .read_u64 = bch_verify_read,
+ .write_u64 = bch_verify_write,
+ },
+ {
+ .name = "cache_hits",
+ .read_u64 = bch_cache_hits_read,
+ },
+ {
+ .name = "cache_misses",
+ .read_u64 = bch_cache_misses_read,
+ },
+ {
+ .name = "cache_bypass_hits",
+ .read_u64 = bch_cache_bypass_hits_read,
+ },
+ {
+ .name = "cache_bypass_misses",
+ .read_u64 = bch_cache_bypass_misses_read,
+ },
+ { } /* terminate */
+};
+
+static void init_bch_cgroup(struct bch_cgroup *cg)
+{
+ cg->cache_mode = -1;
+}
+
+static struct cgroup_subsys_state *bcachecg_create(struct cgroup *cgroup)
+{
+ struct bch_cgroup *cg;
+
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+ init_bch_cgroup(cg);
+ return &cg->css;
+}
+
+static void bcachecg_destroy(struct cgroup *cgroup)
+{
+ struct bch_cgroup *cg = cgroup_to_bcache(cgroup);
+ free_css_id(&bcache_subsys, &cg->css);
+ kfree(cg);
+}
+
+struct cgroup_subsys bcache_subsys = {
+ .create = bcachecg_create,
+ .destroy = bcachecg_destroy,
+ .subsys_id = bcache_subsys_id,
+ .name = "bcache",
+ .module = THIS_MODULE,
+};
+EXPORT_SYMBOL_GPL(bcache_subsys);
+#endif
+
+static unsigned cache_mode(struct cached_dev *dc, struct bio *bio)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+ int r = bch_bio_to_cgroup(bio)->cache_mode;
+ if (r >= 0)
+ return r;
+#endif
+ return BDEV_CACHE_MODE(&dc->sb);
+}
+
+static bool verify(struct cached_dev *dc, struct bio *bio)
+{
+#ifdef CONFIG_CGROUP_BCACHE
+ if (bch_bio_to_cgroup(bio)->verify)
+ return true;
+#endif
+ return dc->verify;
+}
+
+static void
bio_csum(struct bio *bio, struct bkey *k)
+{
+ struct bio_vec *bv;
+ uint64_t csum = 0;
+ int i;
+
+ bio_for_each_segment(bv, bio, i) {
+ void *d = kmap(bv->bv_page) + bv->bv_offset;
+ csum = crc64_update(csum, d, bv->bv_len);
+ kunmap(bv->bv_page);
+ }
+
+ k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
+}
+
+/* Insert data into cache */
+
+static void bio_invalidate(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+ struct bio *bio = op->cache_bio;
+
+ pr_debug("invalidating %i sectors from %llu",
+ bio_sectors(bio), (uint64_t) bio->bi_sector);
+
+ while (bio_sectors(bio)) {
+ unsigned len = min(bio_sectors(bio), 1U << 14);
+
+ if (bch_keylist_realloc(&op->keys, 0, op->c))
+ goto out;
+
+ bio->bi_sector += len;
+ bio->bi_size -= len << 9;
+
+ bch_keylist_add(&op->keys,
+ &KEY(op->inode, bio->bi_sector, len));
+ }
+
+ op->insert_data_done = true;
+ bio_put(bio);
+out:
+ continue_at(cl, bch_journal, bcache_wq);
+}
+
+struct open_bucket {
+ struct list_head list;
+ struct task_struct *last;
+ unsigned sectors_free;
+ BKEY_PADDED(key);
+};
+
+void bch_open_buckets_free(struct cache_set *c)
+{
+ struct open_bucket *b;
+
+ while (!list_empty(&c->data_buckets)) {
+ b = list_first_entry(&c->data_buckets,
+ struct open_bucket, list);
+ list_del(&b->list);
+ kfree(b);
+ }
+}
+
+int bch_open_buckets_alloc(struct cache_set *c)
+{
+ int i;
+
+ spin_lock_init(&c->data_bucket_lock);
+
+ for (i = 0; i < 6; i++) {
+ struct open_bucket *b = kzalloc(sizeof(*b), GFP_KERNEL);
+ if (!b)
+ return -ENOMEM;
+
+ list_add(&b->list, &c->data_buckets);
+ }
+
+ return 0;
+}
+
+/*
+ * We keep multiple buckets open for writes, and try to segregate different
+ * write streams for better cache utilization: first we look for a bucket where
+ * the last write to it was sequential with the current write, and failing that
+ * we look for a bucket that was last used by the same task.
+ *
+ * The idea is that if you've got multiple tasks pulling data into the cache at
+ * the same time, you'll get better cache utilization if you try to segregate
+ * their data and preserve locality.
+ *
+ * For example, say you're starting Firefox at the same time you're copying a
+ * bunch of files. Firefox will likely end up being fairly hot and stay in the
+ * cache awhile, but the data you copied might not be; if you wrote all that
+ * data to the same buckets it'd get invalidated at the same time.
+ *
+ * Both of those tasks will be doing fairly random IO so we can't rely on
+ * detecting sequential IO to segregate their data, but going off of the task
+ * should be a sane heuristic.
+ */
+static struct open_bucket *pick_data_bucket(struct cache_set *c,
+ const struct bkey *search,
+ struct task_struct *task,
+ struct bkey *alloc)
+{
+ struct open_bucket *ret, *ret_task = NULL;
+
+ list_for_each_entry_reverse(ret, &c->data_buckets, list)
+ if (!bkey_cmp(&ret->key, search))
+ goto found;
+ else if (ret->last == task)
+ ret_task = ret;
+
+ ret = ret_task ?: list_first_entry(&c->data_buckets,
+ struct open_bucket, list);
+found:
+ if (!ret->sectors_free && KEY_PTRS(alloc)) {
+ ret->sectors_free = c->sb.bucket_size;
+ bkey_copy(&ret->key, alloc);
+ bkey_init(alloc);
+ }
+
+ if (!ret->sectors_free)
+ ret = NULL;
+
+ return ret;
+}
+
+/*
+ * Allocates some space in the cache to write to, sets k to point to the newly
+ * allocated space, and updates KEY_SIZE(k) and KEY_OFFSET(k) (to point to the
+ * end of the newly allocated space).
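+ *
+ * (bcache keys address the end of the extent they point to: on entry
+ * KEY_OFFSET(k) is the first sector to write, and below we advance it by the
+ * number of sectors actually allocated and set KEY_SIZE(k), so the start is
+ * recovered as KEY_OFFSET(k) - KEY_SIZE(k).)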
+ *
+ * May allocate fewer sectors than @sectors; KEY_SIZE(k) indicates how many
+ * sectors were actually allocated.
+ *
+ * If s->writeback is true, will not fail.
+ */
+static bool bch_alloc_sectors(struct bkey *k, unsigned sectors,
+ struct search *s)
+{
+ struct cache_set *c = s->op.c;
+ struct open_bucket *b;
+ BKEY_PADDED(key) alloc;
+ struct closure cl, *w = NULL;
+ unsigned i;
+
+ if (s->writeback) {
+ closure_init_stack(&cl);
+ w = &cl;
+ }
+
+ /*
+ * We might have to allocate a new bucket, which we can't do with a
+ * spinlock held. So if we have to allocate, we drop the lock, allocate
+ * and then retry. KEY_PTRS() indicates whether alloc points to
+ * allocated bucket(s).
+ */
+
+ bkey_init(&alloc.key);
+ spin_lock(&c->data_bucket_lock);
+
+ while (!(b = pick_data_bucket(c, k, s->task, &alloc.key))) {
+ unsigned watermark = s->op.write_prio
+ ? WATERMARK_MOVINGGC
+ : WATERMARK_NONE;
+
+ spin_unlock(&c->data_bucket_lock);
+
+ if (bch_bucket_alloc_set(c, watermark, &alloc.key, 1, w))
+ return false;
+
+ spin_lock(&c->data_bucket_lock);
+ }
+
+ /*
+ * If we had to allocate, we might race and not need to allocate the
+ * second time we call pick_data_bucket(). If we allocated a bucket but
+ * didn't use it, drop the refcount bch_bucket_alloc_set() took:
+ */
+ if (KEY_PTRS(&alloc.key))
+ __bkey_put(c, &alloc.key);
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ EBUG_ON(ptr_stale(c, &b->key, i));
+
+ /* Set up the pointer to the space we're allocating: */
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ k->ptr[i] = b->key.ptr[i];
+
+ sectors = min(sectors, b->sectors_free);
+
+ SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors);
+ SET_KEY_SIZE(k, sectors);
+ SET_KEY_PTRS(k, KEY_PTRS(&b->key));
+
+ /*
+ * Move b to the end of the lru, and keep track of what this bucket was
+ * last used for:
+ */
+ list_move_tail(&b->list, &c->data_buckets);
+ bkey_copy_key(&b->key, k);
+ b->last = s->task;
+
+ b->sectors_free -= sectors;
+
+ for (i = 0; i < KEY_PTRS(&b->key); i++) {
+ SET_PTR_OFFSET(&b->key, i, PTR_OFFSET(&b->key, i) + sectors);
+
+ atomic_long_add(sectors,
+ &PTR_CACHE(c, &b->key, i)->sectors_written);
+ }
+
+ if (b->sectors_free < c->sb.block_size)
+ b->sectors_free = 0;
+
+ /*
+ * k takes refcounts on the buckets it points to until it's inserted
+ * into the btree, but if we're done with this bucket we just transfer
+ * the open bucket's refcount.
+ */
+ if (b->sectors_free)
+ for (i = 0; i < KEY_PTRS(&b->key); i++)
+ atomic_inc(&PTR_BUCKET(c, &b->key, i)->pin);
+
+ spin_unlock(&c->data_bucket_lock);
+ return true;
+}
+
+static void bch_insert_data_error(struct closure *cl)
+{
+ struct btree_op *op = container_of(cl, struct btree_op, cl);
+
+ /*
+ * Our data write just errored, which means we've got a bunch of keys to
+ * insert that point to data that wasn't successfully written.
+ *
+ * We don't have to insert those keys but we still have to invalidate
+ * that region of the cache - so, if we just strip off all the pointers
+ * from the keys we'll accomplish just that.
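+ *
+ * (A key with KEY_PTRS == 0 still covers its range in the index, and
+ * inserting it invalidates whatever the cache held there - the same
+ * trick bio_invalidate() uses.)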
+ */ + + struct bkey *src = op->keys.bottom, *dst = op->keys.bottom; + + while (src != op->keys.top) { + struct bkey *n = bkey_next(src); + + SET_KEY_PTRS(src, 0); + bkey_copy(dst, src); + + dst = bkey_next(dst); + src = n; + } + + op->keys.top = dst; + + bch_journal(cl); +} + +static void bch_insert_data_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + + if (error) { + /* TODO: We could try to recover from this. */ + if (s->writeback) + s->error = error; + else if (s->write) + set_closure_fn(cl, bch_insert_data_error, bcache_wq); + else + set_closure_fn(cl, NULL, NULL); + } + + bch_bbio_endio(op->c, bio, error, "writing data to cache"); +} + +static void bch_insert_data_loop(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + struct bio *bio = op->cache_bio, *n; + + if (op->skip) + return bio_invalidate(cl); + + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) { + set_gc_sectors(op->c); + bch_queue_gc(op->c); + } + + do { + unsigned i; + struct bkey *k; + struct bio_set *split = s->d + ? s->d->bio_split : op->c->bio_split; + + /* 1 for the device pointer and 1 for the chksum */ + if (bch_keylist_realloc(&op->keys, + 1 + (op->csum ? 1 : 0), + op->c)) + continue_at(cl, bch_journal, bcache_wq); + + k = op->keys.top; + bkey_init(k); + SET_KEY_INODE(k, op->inode); + SET_KEY_OFFSET(k, bio->bi_sector); + + if (!bch_alloc_sectors(k, bio_sectors(bio), s)) + goto err; + + n = bch_bio_split(bio, KEY_SIZE(k), GFP_NOIO, split); + if (!n) { + __bkey_put(op->c, k); + continue_at(cl, bch_insert_data_loop, bcache_wq); + } + + n->bi_end_io = bch_insert_data_endio; + n->bi_private = cl; + + if (s->writeback) { + SET_KEY_DIRTY(k, true); + + for (i = 0; i < KEY_PTRS(k); i++) + SET_GC_MARK(PTR_BUCKET(op->c, k, i), + GC_MARK_DIRTY); + } + + SET_KEY_CSUM(k, op->csum); + if (KEY_CSUM(k)) + bio_csum(n, k); + + pr_debug("%s", pkey(k)); + bch_keylist_push(&op->keys); + + trace_bcache_cache_insert(n, n->bi_sector, n->bi_bdev); + n->bi_rw |= REQ_WRITE; + bch_submit_bbio(n, op->c, k, 0); + } while (n != bio); + + op->insert_data_done = true; + continue_at(cl, bch_journal, bcache_wq); +err: + /* bch_alloc_sectors() blocks if s->writeback = true */ + BUG_ON(s->writeback); + + /* + * But if it's not a writeback write we'd rather just bail out if + * there aren't any buckets ready to write to - it might take awhile and + * we might be starving btree writes for gc or something. + */ + + if (s->write) { + /* + * Writethrough write: We can't complete the write until we've + * updated the index. But we don't want to delay the write while + * we wait for buckets to be freed up, so just invalidate the + * rest of the write. + */ + op->skip = true; + return bio_invalidate(cl); + } else { + /* + * From a cache miss, we can just insert the keys for the data + * we have written or bail out if we didn't do anything. 
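+ *
+ * (Unlike bch_insert_data_error() above, we don't strip the pointers:
+ * the keys accumulated so far point at data that really was written.)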
+ */ + op->insert_data_done = true; + bio_put(bio); + + if (!bch_keylist_empty(&op->keys)) + continue_at(cl, bch_journal, bcache_wq); + else + closure_return(cl); + } +} + +/** + * bch_insert_data - stick some data in the cache + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data had to be fragmented there will be multiple keys); after the + * data is written it calls bch_journal, and after the keys have been added to + * the next journal write they're inserted into the btree. + * + * It inserts the data in op->cache_bio; bi_sector is used for the key offset, + * and op->inode is used for the key inode. + * + * If op->skip is true, instead of inserting the data it invalidates the region + * of the cache represented by op->cache_bio and op->inode. + */ +void bch_insert_data(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + bch_keylist_init(&op->keys); + bio_get(op->cache_bio); + bch_insert_data_loop(cl); +} + +void bch_btree_insert_async(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + struct search *s = container_of(op, struct search, op); + + if (bch_btree_insert(op, op->c)) { + s->error = -ENOMEM; + op->insert_data_done = true; + } + + if (op->insert_data_done) { + bch_keylist_free(&op->keys); + closure_return(cl); + } else + continue_at(cl, bch_insert_data_loop, bcache_wq); +} + +/* Common code for the make_request functions */ + +static void request_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + + if (error) { + struct search *s = container_of(cl, struct search, cl); + s->error = error; + /* Only cache read errors are recoverable */ + s->recoverable = false; + } + + bio_put(bio); + closure_put(cl); +} + +void bch_cache_read_endio(struct bio *bio, int error) +{ + struct bbio *b = container_of(bio, struct bbio, bio); + struct closure *cl = bio->bi_private; + struct search *s = container_of(cl, struct search, cl); + + /* + * If the bucket was reused while our bio was in flight, we might have + * read the wrong data. Set s->error but not error so it doesn't get + * counted against the cache device, but we'll still reread the data + * from the backing device. 
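+ *
+ * (-EINTR is just a sentinel here: request_read_error() clears s->error
+ * and rereads from the backing device, so losing the race costs us a
+ * reread, not a failed request.)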
+ */ + + if (error) + s->error = error; + else if (ptr_stale(s->op.c, &b->key, 0)) { + atomic_long_inc(&s->op.c->cache_read_races); + s->error = -EINTR; + } + + bch_bbio_endio(s->op.c, bio, error, "reading from cache"); +} + +static void bio_complete(struct search *s) +{ + if (s->orig_bio) { + int cpu, rw = bio_data_dir(s->orig_bio); + unsigned long duration = jiffies - s->start_time; + + cpu = part_stat_lock(); + part_round_stats(cpu, &s->d->disk->part0); + part_stat_add(cpu, &s->d->disk->part0, ticks[rw], duration); + part_stat_unlock(); + + trace_bcache_request_end(s, s->orig_bio); + bio_endio(s->orig_bio, s->error); + s->orig_bio = NULL; + } +} + +static void do_bio_hook(struct search *s) +{ + struct bio *bio = &s->bio.bio; + memcpy(bio, s->orig_bio, sizeof(struct bio)); + + bio->bi_end_io = request_endio; + bio->bi_private = &s->cl; + atomic_set(&bio->bi_cnt, 3); +} + +static void search_free(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + bio_complete(s); + + if (s->op.cache_bio) + bio_put(s->op.cache_bio); + + if (s->unaligned_bvec) + mempool_free(s->bio.bio.bi_io_vec, s->d->unaligned_bvec); + + closure_debug_destroy(cl); + mempool_free(s, s->d->c->search); +} + +static struct search *search_alloc(struct bio *bio, struct bcache_device *d) +{ + struct bio_vec *bv; + struct search *s = mempool_alloc(d->c->search, GFP_NOIO); + memset(s, 0, offsetof(struct search, op.keys)); + + __closure_init(&s->cl, NULL); + + s->op.inode = d->id; + s->op.c = d->c; + s->d = d; + s->op.lock = -1; + s->task = current; + s->orig_bio = bio; + s->write = (bio->bi_rw & REQ_WRITE) != 0; + s->op.flush_journal = (bio->bi_rw & REQ_FLUSH) != 0; + s->op.skip = (bio->bi_rw & REQ_DISCARD) != 0; + s->recoverable = 1; + s->start_time = jiffies; + do_bio_hook(s); + + if (bio->bi_size != bio_segments(bio) * PAGE_SIZE) { + bv = mempool_alloc(d->unaligned_bvec, GFP_NOIO); + memcpy(bv, bio_iovec(bio), + sizeof(struct bio_vec) * bio_segments(bio)); + + s->bio.bio.bi_io_vec = bv; + s->unaligned_bvec = 1; + } + + return s; +} + +static void btree_read_async(struct closure *cl) +{ + struct btree_op *op = container_of(cl, struct btree_op, cl); + + int ret = btree_root(search_recurse, op->c, op); + + if (ret == -EAGAIN) + continue_at(cl, btree_read_async, bcache_wq); + + closure_return(cl); +} + +/* Cached devices */ + +static void cached_dev_bio_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + search_free(cl); + cached_dev_put(dc); +} + +/* Process reads */ + +static void cached_dev_read_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + + if (s->op.insert_collision) + bch_mark_cache_miss_collision(s); + + if (s->op.cache_bio) { + int i; + struct bio_vec *bv; + + __bio_for_each_segment(bv, s->op.cache_bio, i, 0) + __free_page(bv->bv_page); + } + + cached_dev_bio_complete(cl); +} + +static void request_read_error(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct bio_vec *bv; + int i; + + if (s->recoverable) { + /* The cache read failed, but we can retry from the backing + * device. 
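+ *
+ * The retried bio completes through request_endio(), and the search
+ * then finishes via cached_dev_read_complete() below; if the backing
+ * device also errors, that error is what gets reported.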
+ */ + pr_debug("recovering at sector %llu", + (uint64_t) s->orig_bio->bi_sector); + + s->error = 0; + bv = s->bio.bio.bi_io_vec; + do_bio_hook(s); + s->bio.bio.bi_io_vec = bv; + + if (!s->unaligned_bvec) + bio_for_each_segment(bv, s->orig_bio, i) + bv->bv_offset = 0, bv->bv_len = PAGE_SIZE; + else + memcpy(s->bio.bio.bi_io_vec, + bio_iovec(s->orig_bio), + sizeof(struct bio_vec) * + bio_segments(s->orig_bio)); + + /* XXX: invalidate cache */ + + trace_bcache_read_retry(&s->bio.bio); + closure_bio_submit(&s->bio.bio, &s->cl, s->d); + } + + continue_at(cl, cached_dev_read_complete, NULL); +} + +static void request_read_done(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + /* + * s->cache_bio != NULL implies that we had a cache miss; cache_bio now + * contains data ready to be inserted into the cache. + * + * First, we copy the data we just read from cache_bio's bounce buffers + * to the buffers the original bio pointed to: + */ + + if (s->op.cache_bio) { + struct bio_vec *src, *dst; + unsigned src_offset, dst_offset, bytes; + void *dst_ptr; + + bio_reset(s->op.cache_bio); + s->op.cache_bio->bi_sector = s->cache_miss->bi_sector; + s->op.cache_bio->bi_bdev = s->cache_miss->bi_bdev; + s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + bio_map(s->op.cache_bio, NULL); + + src = bio_iovec(s->op.cache_bio); + dst = bio_iovec(s->cache_miss); + src_offset = src->bv_offset; + dst_offset = dst->bv_offset; + dst_ptr = kmap(dst->bv_page); + + while (1) { + if (dst_offset == dst->bv_offset + dst->bv_len) { + kunmap(dst->bv_page); + dst++; + if (dst == bio_iovec_idx(s->cache_miss, + s->cache_miss->bi_vcnt)) + break; + + dst_offset = dst->bv_offset; + dst_ptr = kmap(dst->bv_page); + } + + if (src_offset == src->bv_offset + src->bv_len) { + src++; + if (src == bio_iovec_idx(s->op.cache_bio, + s->op.cache_bio->bi_vcnt)) + BUG(); + + src_offset = src->bv_offset; + } + + bytes = min(dst->bv_offset + dst->bv_len - dst_offset, + src->bv_offset + src->bv_len - src_offset); + + memcpy(dst_ptr + dst_offset, + page_address(src->bv_page) + src_offset, + bytes); + + src_offset += bytes; + dst_offset += bytes; + } + + bio_put(s->cache_miss); + s->cache_miss = NULL; + } + + if (verify(dc, &s->bio.bio) && s->recoverable) + bch_data_verify(s); + + bio_complete(s); + + if (s->op.cache_bio && + !test_bit(CACHE_SET_STOPPING, &s->op.c->flags)) { + s->op.type = BTREE_REPLACE; + closure_call(&s->op.cl, bch_insert_data, NULL, cl); + } + + continue_at(cl, cached_dev_read_complete, NULL); +} + +static void request_read_done_bh(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + bch_mark_cache_accounting(s, !s->cache_miss, s->op.skip); + + if (s->error) + continue_at_nobarrier(cl, request_read_error, bcache_wq); + else if (s->op.cache_bio || verify(dc, &s->bio.bio)) + continue_at_nobarrier(cl, request_read_done, bcache_wq); + else + continue_at_nobarrier(cl, cached_dev_read_complete, NULL); +} + +static int cached_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned sectors) +{ + int ret = 0; + unsigned reada; + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct bio *miss; + + miss = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); + if (!miss) + return -EAGAIN; + + if (miss == bio) + s->op.lookup_done = true; + + miss->bi_end_io = request_endio; + 
miss->bi_private = &s->cl; + + if (s->cache_miss || s->op.skip) + goto out_submit; + + if (miss != bio || + (bio->bi_rw & REQ_RAHEAD) || + (bio->bi_rw & REQ_META) || + s->op.c->gc_stats.in_use >= CUTOFF_CACHE_READA) + reada = 0; + else { + reada = min(dc->readahead >> 9, + sectors - bio_sectors(miss)); + + if (bio_end(miss) + reada > bdev_sectors(miss->bi_bdev)) + reada = bdev_sectors(miss->bi_bdev) - bio_end(miss); + } + + s->cache_bio_sectors = bio_sectors(miss) + reada; + s->op.cache_bio = bio_alloc_bioset(GFP_NOWAIT, + DIV_ROUND_UP(s->cache_bio_sectors, PAGE_SECTORS), + dc->disk.bio_split); + + if (!s->op.cache_bio) + goto out_submit; + + s->op.cache_bio->bi_sector = miss->bi_sector; + s->op.cache_bio->bi_bdev = miss->bi_bdev; + s->op.cache_bio->bi_size = s->cache_bio_sectors << 9; + + s->op.cache_bio->bi_end_io = request_endio; + s->op.cache_bio->bi_private = &s->cl; + + /* btree_search_recurse()'s btree iterator is no good anymore */ + ret = -EINTR; + if (!bch_btree_insert_check_key(b, &s->op, s->op.cache_bio)) + goto out_put; + + bio_map(s->op.cache_bio, NULL); + if (bio_alloc_pages(s->op.cache_bio, __GFP_NOWARN|GFP_NOIO)) + goto out_put; + + s->cache_miss = miss; + bio_get(s->op.cache_bio); + + trace_bcache_cache_miss(s->orig_bio); + closure_bio_submit(s->op.cache_bio, &s->cl, s->d); + + return ret; +out_put: + bio_put(s->op.cache_bio); + s->op.cache_bio = NULL; +out_submit: + closure_bio_submit(miss, &s->cl, s->d); + return ret; +} + +static void request_read(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + + check_should_skip(dc, s); + closure_call(&s->op.cl, btree_read_async, NULL, cl); + + continue_at(cl, request_read_done_bh, NULL); +} + +/* Process writes */ + +static void cached_dev_write_complete(struct closure *cl) +{ + struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + + up_read_non_owner(&dc->writeback_lock); + cached_dev_bio_complete(cl); +} + +static bool should_writeback(struct cached_dev *dc, struct bio *bio) +{ + unsigned threshold = (bio->bi_rw & REQ_SYNC) + ? 
CUTOFF_WRITEBACK_SYNC + : CUTOFF_WRITEBACK; + + return !atomic_read(&dc->disk.detaching) && + cache_mode(dc, bio) == CACHE_MODE_WRITEBACK && + dc->disk.c->gc_stats.in_use < threshold; +} + +static void request_write(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + struct bio *bio = &s->bio.bio; + struct bkey start, end; + start = KEY(dc->disk.id, bio->bi_sector, 0); + end = KEY(dc->disk.id, bio_end(bio), 0); + + bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, &start, &end); + + check_should_skip(dc, s); + down_read_non_owner(&dc->writeback_lock); + + if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { + s->op.skip = false; + s->writeback = true; + } + + if (bio->bi_rw & REQ_DISCARD) + goto skip; + + if (s->op.skip) + goto skip; + + if (should_writeback(dc, s->orig_bio)) + s->writeback = true; + + if (!s->writeback) { + s->op.cache_bio = bio_clone_bioset(bio, GFP_NOIO, + dc->disk.bio_split); + + trace_bcache_writethrough(s->orig_bio); + closure_bio_submit(bio, cl, s->d); + } else { + s->op.cache_bio = bio; + trace_bcache_writeback(s->orig_bio); + bch_writeback_add(dc, bio_sectors(bio)); + } +out: + closure_call(&s->op.cl, bch_insert_data, NULL, cl); + continue_at(cl, cached_dev_write_complete, NULL); +skip: + s->op.skip = true; + s->op.cache_bio = s->orig_bio; + bio_get(s->op.cache_bio); + trace_bcache_write_skip(s->orig_bio); + + if ((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + goto out; + + closure_bio_submit(bio, cl, s->d); + goto out; +} + +static void request_nodata(struct cached_dev *dc, struct search *s) +{ + struct closure *cl = &s->cl; + struct bio *bio = &s->bio.bio; + + if (bio->bi_rw & REQ_DISCARD) { + request_write(dc, s); + return; + } + + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + + closure_bio_submit(bio, cl, s->d); + + continue_at(cl, cached_dev_bio_complete, NULL); +} + +/* Cached devices - read & write stuff */ + +int bch_get_congested(struct cache_set *c) +{ + int i; + + if (!c->congested_read_threshold_us && + !c->congested_write_threshold_us) + return 0; + + i = (local_clock_us() - c->congested_last_us) / 1024; + if (i < 0) + return 0; + + i += atomic_read(&c->congested); + if (i >= 0) + return 0; + + i += CONGESTED_MAX; + + return i <= 0 ? 
1 : fract_exp_two(i, 6); +} + +static void add_sequential(struct task_struct *t) +{ + ewma_add(t->sequential_io_avg, + t->sequential_io, 8, 0); + + t->sequential_io = 0; +} + +static void check_should_skip(struct cached_dev *dc, struct search *s) +{ + struct hlist_head *iohash(uint64_t k) + { return &dc->io_hash[hash_64(k, RECENT_IO_BITS)]; } + + struct cache_set *c = s->op.c; + struct bio *bio = &s->bio.bio; + + long rand; + int cutoff = bch_get_congested(c); + unsigned mode = cache_mode(dc, bio); + + if (atomic_read(&dc->disk.detaching) || + c->gc_stats.in_use > CUTOFF_CACHE_ADD || + (bio->bi_rw & REQ_DISCARD)) + goto skip; + + if (mode == CACHE_MODE_NONE || + (mode == CACHE_MODE_WRITEAROUND && + (bio->bi_rw & REQ_WRITE))) + goto skip; + + if (bio->bi_sector & (c->sb.block_size - 1) || + bio_sectors(bio) & (c->sb.block_size - 1)) { + pr_debug("skipping unaligned io"); + goto skip; + } + + if (!cutoff) { + cutoff = dc->sequential_cutoff >> 9; + + if (!cutoff) + goto rescale; + + if (mode == CACHE_MODE_WRITEBACK && + (bio->bi_rw & REQ_WRITE) && + (bio->bi_rw & REQ_SYNC)) + goto rescale; + } + + if (dc->sequential_merge) { + struct io *i; + + spin_lock(&dc->io_lock); + + hlist_for_each_entry(i, iohash(bio->bi_sector), hash) + if (i->last == bio->bi_sector && + time_before(jiffies, i->jiffies)) + goto found; + + i = list_first_entry(&dc->io_lru, struct io, lru); + + add_sequential(s->task); + i->sequential = 0; +found: + if (i->sequential + bio->bi_size > i->sequential) + i->sequential += bio->bi_size; + + i->last = bio_end(bio); + i->jiffies = jiffies + msecs_to_jiffies(5000); + s->task->sequential_io = i->sequential; + + hlist_del(&i->hash); + hlist_add_head(&i->hash, iohash(i->last)); + list_move_tail(&i->lru, &dc->io_lru); + + spin_unlock(&dc->io_lock); + } else { + s->task->sequential_io = bio->bi_size; + + add_sequential(s->task); + } + + rand = get_random_int(); + cutoff -= bitmap_weight(&rand, BITS_PER_LONG); + + if (cutoff <= (int) (max(s->task->sequential_io, + s->task->sequential_io_avg) >> 9)) + goto skip; + +rescale: + bch_rescale_priorities(c, bio_sectors(bio)); + return; +skip: + bch_mark_sectors_bypassed(s, bio_sectors(bio)); + s->op.skip = true; +} + +static void cached_dev_make_request(struct request_queue *q, struct bio *bio) +{ + struct search *s; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + int cpu, rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &d->disk->part0, ios[rw]); + part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + bio->bi_bdev = dc->bdev; + bio->bi_sector += BDEV_DATA_START; + + if (cached_dev_get(dc)) { + s = search_alloc(bio, d); + trace_bcache_request_start(s, bio); + + if (!bio_has_data(bio)) + request_nodata(dc, s); + else if (rw) + request_write(dc, s); + else + request_read(dc, s); + } else { + if ((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(dc->bdev))) + bio_endio(bio, 0); + else + bch_generic_make_request(bio, &d->bio_split_hook); + } +} + +static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg); +} + +static int cached_dev_congested(void *data, int bits) +{ + struct bcache_device *d = data; + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + struct request_queue *q = 
bdev_get_queue(dc->bdev); + int ret = 0; + + if (bdi_congested(&q->backing_dev_info, bits)) + return 1; + + if (cached_dev_get(dc)) { + unsigned i; + struct cache *ca; + + for_each_cache(ca, d->c, i) { + q = bdev_get_queue(ca->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + cached_dev_put(dc); + } + + return ret; +} + +void bch_cached_dev_request_init(struct cached_dev *dc) +{ + struct gendisk *g = dc->disk.disk; + + g->queue->make_request_fn = cached_dev_make_request; + g->queue->backing_dev_info.congested_fn = cached_dev_congested; + dc->disk.cache_miss = cached_dev_cache_miss; + dc->disk.ioctl = cached_dev_ioctl; +} + +/* Flash backed devices */ + +static int flash_dev_cache_miss(struct btree *b, struct search *s, + struct bio *bio, unsigned sectors) +{ + /* Zero fill bio */ + + while (bio->bi_idx != bio->bi_vcnt) { + struct bio_vec *bv = bio_iovec(bio); + unsigned j = min(bv->bv_len >> 9, sectors); + + void *p = kmap(bv->bv_page); + memset(p + bv->bv_offset, 0, j << 9); + kunmap(bv->bv_page); + + bv->bv_len -= j << 9; + bv->bv_offset += j << 9; + + if (bv->bv_len) + return 0; + + bio->bi_sector += j; + bio->bi_size -= j << 9; + + bio->bi_idx++; + sectors -= j; + } + + s->op.lookup_done = true; + + return 0; +} + +static void flash_dev_make_request(struct request_queue *q, struct bio *bio) +{ + struct search *s; + struct closure *cl; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; + int cpu, rw = bio_data_dir(bio); + + cpu = part_stat_lock(); + part_stat_inc(cpu, &d->disk->part0, ios[rw]); + part_stat_add(cpu, &d->disk->part0, sectors[rw], bio_sectors(bio)); + part_stat_unlock(); + + s = search_alloc(bio, d); + cl = &s->cl; + bio = &s->bio.bio; + + trace_bcache_request_start(s, bio); + + if (bio_has_data(bio) && !rw) { + closure_call(&s->op.cl, btree_read_async, NULL, cl); + } else if (bio_has_data(bio) || s->op.skip) { + bch_keybuf_check_overlapping(&s->op.c->moving_gc_keys, + &KEY(d->id, bio->bi_sector, 0), + &KEY(d->id, bio_end(bio), 0)); + + s->writeback = true; + s->op.cache_bio = bio; + + closure_call(&s->op.cl, bch_insert_data, NULL, cl); + } else { + /* No data - probably a cache flush */ + if (s->op.flush_journal) + bch_journal_meta(s->op.c, cl); + } + + continue_at(cl, search_free, NULL); +} + +static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} + +static int flash_dev_congested(void *data, int bits) +{ + struct bcache_device *d = data; + struct request_queue *q; + struct cache *ca; + unsigned i; + int ret = 0; + + for_each_cache(ca, d->c, i) { + q = bdev_get_queue(ca->bdev); + ret |= bdi_congested(&q->backing_dev_info, bits); + } + + return ret; +} + +void bch_flash_dev_request_init(struct bcache_device *d) +{ + struct gendisk *g = d->disk; + + g->queue->make_request_fn = flash_dev_make_request; + g->queue->backing_dev_info.congested_fn = flash_dev_congested; + d->cache_miss = flash_dev_cache_miss; + d->ioctl = flash_dev_ioctl; +} + +void bch_request_exit(void) +{ +#ifdef CONFIG_CGROUP_BCACHE + cgroup_unload_subsys(&bcache_subsys); +#endif + if (bch_search_cache) + kmem_cache_destroy(bch_search_cache); +} + +int __init bch_request_init(void) +{ + bch_search_cache = KMEM_CACHE(search, 0); + if (!bch_search_cache) + return -ENOMEM; + +#ifdef CONFIG_CGROUP_BCACHE + cgroup_load_subsys(&bcache_subsys); + init_bch_cgroup(&bcache_default_cgroup); + + cgroup_add_cftypes(&bcache_subsys, bch_files); +#endif + return 0; +} diff --git a/drivers/md/bcache/request.h 
b/drivers/md/bcache/request.h
new file mode 100644
index 000000000000..254d9ab5707c
--- /dev/null
+++ b/drivers/md/bcache/request.h
@@ -0,0 +1,62 @@
+#ifndef _BCACHE_REQUEST_H_
+#define _BCACHE_REQUEST_H_
+
+#include
+
+struct search {
+ /* Stack frame for bio_complete */
+ struct closure cl;
+
+ struct bcache_device *d;
+ struct task_struct *task;
+
+ struct bbio bio;
+ struct bio *orig_bio;
+ struct bio *cache_miss;
+ unsigned cache_bio_sectors;
+
+ unsigned recoverable:1;
+ unsigned unaligned_bvec:1;
+
+ unsigned write:1;
+ unsigned writeback:1;
+
+ /* IO error returned to s->bio */
+ short error;
+ unsigned long start_time;
+
+ /* Anything past op->keys won't get zeroed in do_bio_hook */
+ struct btree_op op;
+};
+
+void bch_cache_read_endio(struct bio *, int);
+int bch_get_congested(struct cache_set *);
+void bch_insert_data(struct closure *cl);
+void bch_btree_insert_async(struct closure *);
+
+void bch_open_buckets_free(struct cache_set *);
+int bch_open_buckets_alloc(struct cache_set *);
+
+void bch_cached_dev_request_init(struct cached_dev *dc);
+void bch_flash_dev_request_init(struct bcache_device *d);
+
+extern struct kmem_cache *bch_search_cache, *bch_passthrough_cache;
+
+struct bch_cgroup {
+#ifdef CONFIG_CGROUP_BCACHE
+ struct cgroup_subsys_state css;
+#endif
+ /*
+ * We subtract one from the index into bch_cache_modes[], so that
+ * default == -1; this makes it so the rest match up with d->cache_mode,
+ * and we use d->cache_mode if cgrp->cache_mode < 0
+ */
+ short cache_mode;
+ bool verify;
+ struct cache_stat_collector stats;
+};
+
+struct bch_cgroup *bch_bio_to_cgroup(struct bio *bio);
+
+#endif /* _BCACHE_REQUEST_H_ */
diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c
new file mode 100644
index 000000000000..bf6cf9518c89
--- /dev/null
+++ b/drivers/md/bcache/stats.c
@@ -0,0 +1,245 @@
+/*
+ * bcache stats code
+ *
+ * Copyright 2012 Google, Inc.
+ */
+
+#include "bcache.h"
+#include "stats.h"
+#include "btree.h"
+#include "request.h"
+#include "sysfs.h"
+
+/*
+ * We keep absolute totals of various statistics, and additionally a set of
+ * three rolling averages.
+ *
+ * Every so often, a timer goes off and rescales the rolling averages.
+ * The DAY/HOUR/FIVE_MINUTE_RESCALE constants are how many times the timer has
+ * to go off before we rescale each set of numbers; that gets us half lives of
+ * 5 minutes, one hour, and one day.
+ *
+ * accounting_delay is how often the timer goes off - 22 times in 5 minutes,
+ * and accounting_weight is what we use to rescale:
+ *
+ * pow(31 / 32, 22) ~= 1/2
+ *
+ * So that we don't have to increment each set of numbers every time we (say)
+ * get a cache hit, we increment a single atomic_t in acc->collector, and when
+ * the rescale function runs it resets the atomic counter to 0 and adds its
+ * old value to each of the exported numbers.
+ *
+ * To reduce rounding error, the numbers in struct cache_stats are all
+ * stored left shifted by 16, and scaled back in the sysfs show() function.
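+ *
+ * For example, a cache hit enters the collector as 1 and is folded into
+ * each cache_stats set as 1 << 16; show() then prints cache_hits >> 16,
+ * so repeated 31/32 rescales lose fractions of a hit, not whole hits.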
+ */ + +static const unsigned DAY_RESCALE = 288; +static const unsigned HOUR_RESCALE = 12; +static const unsigned FIVE_MINUTE_RESCALE = 1; +static const unsigned accounting_delay = (HZ * 300) / 22; +static const unsigned accounting_weight = 32; + +/* sysfs reading/writing */ + +read_attribute(cache_hits); +read_attribute(cache_misses); +read_attribute(cache_bypass_hits); +read_attribute(cache_bypass_misses); +read_attribute(cache_hit_ratio); +read_attribute(cache_readaheads); +read_attribute(cache_miss_collisions); +read_attribute(bypassed); + +SHOW(bch_stats) +{ + struct cache_stats *s = + container_of(kobj, struct cache_stats, kobj); +#define var(stat) (s->stat >> 16) + var_print(cache_hits); + var_print(cache_misses); + var_print(cache_bypass_hits); + var_print(cache_bypass_misses); + + sysfs_print(cache_hit_ratio, + DIV_SAFE(var(cache_hits) * 100, + var(cache_hits) + var(cache_misses))); + + var_print(cache_readaheads); + var_print(cache_miss_collisions); + sysfs_hprint(bypassed, var(sectors_bypassed) << 9); +#undef var + return 0; +} + +STORE(bch_stats) +{ + return size; +} + +static void bch_stats_release(struct kobject *k) +{ +} + +static struct attribute *bch_stats_files[] = { + &sysfs_cache_hits, + &sysfs_cache_misses, + &sysfs_cache_bypass_hits, + &sysfs_cache_bypass_misses, + &sysfs_cache_hit_ratio, + &sysfs_cache_readaheads, + &sysfs_cache_miss_collisions, + &sysfs_bypassed, + NULL +}; +static KTYPE(bch_stats); + +static void scale_accounting(unsigned long data); + +void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent) +{ + kobject_init(&acc->total.kobj, &bch_stats_ktype); + kobject_init(&acc->five_minute.kobj, &bch_stats_ktype); + kobject_init(&acc->hour.kobj, &bch_stats_ktype); + kobject_init(&acc->day.kobj, &bch_stats_ktype); + + closure_init(&acc->cl, parent); + init_timer(&acc->timer); + acc->timer.expires = jiffies + accounting_delay; + acc->timer.data = (unsigned long) acc; + acc->timer.function = scale_accounting; + add_timer(&acc->timer); +} + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent) +{ + int ret = kobject_add(&acc->total.kobj, parent, + "stats_total"); + ret = ret ?: kobject_add(&acc->five_minute.kobj, parent, + "stats_five_minute"); + ret = ret ?: kobject_add(&acc->hour.kobj, parent, + "stats_hour"); + ret = ret ?: kobject_add(&acc->day.kobj, parent, + "stats_day"); + return ret; +} + +void bch_cache_accounting_clear(struct cache_accounting *acc) +{ + memset(&acc->total.cache_hits, + 0, + sizeof(unsigned long) * 7); +} + +void bch_cache_accounting_destroy(struct cache_accounting *acc) +{ + kobject_put(&acc->total.kobj); + kobject_put(&acc->five_minute.kobj); + kobject_put(&acc->hour.kobj); + kobject_put(&acc->day.kobj); + + atomic_set(&acc->closing, 1); + if (del_timer_sync(&acc->timer)) + closure_return(&acc->cl); +} + +/* EWMA scaling */ + +static void scale_stat(unsigned long *stat) +{ + *stat = ewma_add(*stat, 0, accounting_weight, 0); +} + +static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) +{ + if (++stats->rescale == rescale_at) { + stats->rescale = 0; + scale_stat(&stats->cache_hits); + scale_stat(&stats->cache_misses); + scale_stat(&stats->cache_bypass_hits); + scale_stat(&stats->cache_bypass_misses); + scale_stat(&stats->cache_readaheads); + scale_stat(&stats->cache_miss_collisions); + scale_stat(&stats->sectors_bypassed); + } +} + +static void scale_accounting(unsigned long data) +{ + struct cache_accounting *acc = (struct cache_accounting *) data; + 
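+ /*
+ * Each tick folds the collector into the four stat sets (move_stat
+ * below), then decays them: scale_stat() computes roughly
+ *
+ * stat = stat * (accounting_weight - 1) / accounting_weight
+ *
+ * i.e. stat *= 31/32, and since the timer fires 22 times per five
+ * minutes, (31/32)^22 ~= 0.497 gives the five minute set its half
+ * life (the hour and day sets only decay every 12th and 288th tick).
+ */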
+#define move_stat(name) do { \ + unsigned t = atomic_xchg(&acc->collector.name, 0); \ + t <<= 16; \ + acc->five_minute.name += t; \ + acc->hour.name += t; \ + acc->day.name += t; \ + acc->total.name += t; \ +} while (0) + + move_stat(cache_hits); + move_stat(cache_misses); + move_stat(cache_bypass_hits); + move_stat(cache_bypass_misses); + move_stat(cache_readaheads); + move_stat(cache_miss_collisions); + move_stat(sectors_bypassed); + + scale_stats(&acc->total, 0); + scale_stats(&acc->day, DAY_RESCALE); + scale_stats(&acc->hour, HOUR_RESCALE); + scale_stats(&acc->five_minute, FIVE_MINUTE_RESCALE); + + acc->timer.expires += accounting_delay; + + if (!atomic_read(&acc->closing)) + add_timer(&acc->timer); + else + closure_return(&acc->cl); +} + +static void mark_cache_stats(struct cache_stat_collector *stats, + bool hit, bool bypass) +{ + if (!bypass) + if (hit) + atomic_inc(&stats->cache_hits); + else + atomic_inc(&stats->cache_misses); + else + if (hit) + atomic_inc(&stats->cache_bypass_hits); + else + atomic_inc(&stats->cache_bypass_misses); +} + +void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + mark_cache_stats(&dc->accounting.collector, hit, bypass); + mark_cache_stats(&s->op.c->accounting.collector, hit, bypass); +#ifdef CONFIG_CGROUP_BCACHE + mark_cache_stats(&(bch_bio_to_cgroup(s->orig_bio)->stats), hit, bypass); +#endif +} + +void bch_mark_cache_readahead(struct search *s) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_readaheads); + atomic_inc(&s->op.c->accounting.collector.cache_readaheads); +} + +void bch_mark_cache_miss_collision(struct search *s) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + atomic_inc(&dc->accounting.collector.cache_miss_collisions); + atomic_inc(&s->op.c->accounting.collector.cache_miss_collisions); +} + +void bch_mark_sectors_bypassed(struct search *s, int sectors) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + atomic_add(sectors, &dc->accounting.collector.sectors_bypassed); + atomic_add(sectors, &s->op.c->accounting.collector.sectors_bypassed); +} diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h new file mode 100644 index 000000000000..c7c7a8fd29fe --- /dev/null +++ b/drivers/md/bcache/stats.h @@ -0,0 +1,58 @@ +#ifndef _BCACHE_STATS_H_ +#define _BCACHE_STATS_H_ + +struct cache_stat_collector { + atomic_t cache_hits; + atomic_t cache_misses; + atomic_t cache_bypass_hits; + atomic_t cache_bypass_misses; + atomic_t cache_readaheads; + atomic_t cache_miss_collisions; + atomic_t sectors_bypassed; +}; + +struct cache_stats { + struct kobject kobj; + + unsigned long cache_hits; + unsigned long cache_misses; + unsigned long cache_bypass_hits; + unsigned long cache_bypass_misses; + unsigned long cache_readaheads; + unsigned long cache_miss_collisions; + unsigned long sectors_bypassed; + + unsigned rescale; +}; + +struct cache_accounting { + struct closure cl; + struct timer_list timer; + atomic_t closing; + + struct cache_stat_collector collector; + + struct cache_stats total; + struct cache_stats five_minute; + struct cache_stats hour; + struct cache_stats day; +}; + +struct search; + +void bch_cache_accounting_init(struct cache_accounting *acc, + struct closure *parent); + +int bch_cache_accounting_add_kobjs(struct cache_accounting *acc, + struct kobject *parent); + +void bch_cache_accounting_clear(struct 
cache_accounting *acc); + +void bch_cache_accounting_destroy(struct cache_accounting *acc); + +void bch_mark_cache_accounting(struct search *s, bool hit, bool bypass); +void bch_mark_cache_readahead(struct search *s); +void bch_mark_cache_miss_collision(struct search *s); +void bch_mark_sectors_bypassed(struct search *s, int sectors); + +#endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c new file mode 100644 index 000000000000..31ef47f1f3b6 --- /dev/null +++ b/drivers/md/bcache/super.c @@ -0,0 +1,1941 @@ +/* + * bcache setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" +#include "request.h" + +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet "); + +static const char bcache_magic[] = { + 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 +}; + +static const char invalid_uuid[] = { + 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78, + 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99 +}; + +/* Default is -1; we skip past it for struct cached_dev's cache mode */ +const char * const bch_cache_modes[] = { + "default", + "writethrough", + "writeback", + "writearound", + "none", + NULL +}; + +struct uuid_entry_v0 { + uint8_t uuid[16]; + uint8_t label[32]; + uint32_t first_reg; + uint32_t last_reg; + uint32_t invalidated; + uint32_t pad; +}; + +static struct kobject *bcache_kobj; +struct mutex bch_register_lock; +LIST_HEAD(bch_cache_sets); +static LIST_HEAD(uncached_devices); + +static int bcache_major, bcache_minor; +static wait_queue_head_t unregister_wait; +struct workqueue_struct *bcache_wq; + +#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE) + +static void bio_split_pool_free(struct bio_split_pool *p) +{ + if (p->bio_split) + bioset_free(p->bio_split); + +} + +static int bio_split_pool_init(struct bio_split_pool *p) +{ + p->bio_split = bioset_create(4, 0); + if (!p->bio_split) + return -ENOMEM; + + p->bio_split_hook = mempool_create_kmalloc_pool(4, + sizeof(struct bio_split_hook)); + if (!p->bio_split_hook) + return -ENOMEM; + + return 0; +} + +/* Superblock */ + +static const char *read_super(struct cache_sb *sb, struct block_device *bdev, + struct page **res) +{ + const char *err; + struct cache_sb *s; + struct buffer_head *bh = __bread(bdev, 1, SB_SIZE); + unsigned i; + + if (!bh) + return "IO error"; + + s = (struct cache_sb *) bh->b_data; + + sb->offset = le64_to_cpu(s->offset); + sb->version = le64_to_cpu(s->version); + + memcpy(sb->magic, s->magic, 16); + memcpy(sb->uuid, s->uuid, 16); + memcpy(sb->set_uuid, s->set_uuid, 16); + memcpy(sb->label, s->label, SB_LABEL_SIZE); + + sb->flags = le64_to_cpu(s->flags); + sb->seq = le64_to_cpu(s->seq); + + sb->nbuckets = le64_to_cpu(s->nbuckets); + sb->block_size = le16_to_cpu(s->block_size); + sb->bucket_size = le16_to_cpu(s->bucket_size); + + sb->nr_in_set = le16_to_cpu(s->nr_in_set); + sb->nr_this_dev = le16_to_cpu(s->nr_this_dev); + sb->last_mount = le32_to_cpu(s->last_mount); + + sb->first_bucket = le16_to_cpu(s->first_bucket); + sb->keys = le16_to_cpu(s->keys); + + for (i = 0; i < SB_JOURNAL_BUCKETS; i++) + sb->d[i] = le64_to_cpu(s->d[i]); + + pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u", + sb->version, sb->flags, sb->seq, sb->keys); + + err = "Not a bcache 
superblock"; + if (sb->offset != SB_SECTOR) + goto err; + + if (memcmp(sb->magic, bcache_magic, 16)) + goto err; + + err = "Too many journal buckets"; + if (sb->keys > SB_JOURNAL_BUCKETS) + goto err; + + err = "Bad checksum"; + if (s->csum != csum_set(s)) + goto err; + + err = "Bad UUID"; + if (is_zero(sb->uuid, 16)) + goto err; + + err = "Unsupported superblock version"; + if (sb->version > BCACHE_SB_VERSION) + goto err; + + err = "Bad block/bucket size"; + if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS || + !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS) + goto err; + + err = "Too many buckets"; + if (sb->nbuckets > LONG_MAX) + goto err; + + err = "Not enough buckets"; + if (sb->nbuckets < 1 << 7) + goto err; + + err = "Invalid superblock: device too small"; + if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets) + goto err; + + if (sb->version == CACHE_BACKING_DEV) + goto out; + + err = "Bad UUID"; + if (is_zero(sb->set_uuid, 16)) + goto err; + + err = "Bad cache device number in set"; + if (!sb->nr_in_set || + sb->nr_in_set <= sb->nr_this_dev || + sb->nr_in_set > MAX_CACHES_PER_SET) + goto err; + + err = "Journal buckets not sequential"; + for (i = 0; i < sb->keys; i++) + if (sb->d[i] != sb->first_bucket + i) + goto err; + + err = "Too many journal buckets"; + if (sb->first_bucket + sb->keys > sb->nbuckets) + goto err; + + err = "Invalid superblock: first bucket comes before end of super"; + if (sb->first_bucket * sb->bucket_size < 16) + goto err; +out: + sb->last_mount = get_seconds(); + err = NULL; + + get_page(bh->b_page); + *res = bh->b_page; +err: + put_bh(bh); + return err; +} + +static void write_bdev_super_endio(struct bio *bio, int error) +{ + struct cached_dev *dc = bio->bi_private; + /* XXX: error checking */ + + closure_put(&dc->sb_write.cl); +} + +static void __write_super(struct cache_sb *sb, struct bio *bio) +{ + struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page); + unsigned i; + + bio->bi_sector = SB_SECTOR; + bio->bi_rw = REQ_SYNC|REQ_META; + bio->bi_size = SB_SIZE; + bio_map(bio, NULL); + + out->offset = cpu_to_le64(sb->offset); + out->version = cpu_to_le64(sb->version); + + memcpy(out->uuid, sb->uuid, 16); + memcpy(out->set_uuid, sb->set_uuid, 16); + memcpy(out->label, sb->label, SB_LABEL_SIZE); + + out->flags = cpu_to_le64(sb->flags); + out->seq = cpu_to_le64(sb->seq); + + out->last_mount = cpu_to_le32(sb->last_mount); + out->first_bucket = cpu_to_le16(sb->first_bucket); + out->keys = cpu_to_le16(sb->keys); + + for (i = 0; i < sb->keys; i++) + out->d[i] = cpu_to_le64(sb->d[i]); + + out->csum = csum_set(out); + + pr_debug("ver %llu, flags %llu, seq %llu", + sb->version, sb->flags, sb->seq); + + submit_bio(REQ_WRITE, bio); +} + +void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent) +{ + struct closure *cl = &dc->sb_write.cl; + struct bio *bio = &dc->sb_bio; + + closure_lock(&dc->sb_write, parent); + + bio_reset(bio); + bio->bi_bdev = dc->bdev; + bio->bi_end_io = write_bdev_super_endio; + bio->bi_private = dc; + + closure_get(cl); + __write_super(&dc->sb, bio); + + closure_return(cl); +} + +static void write_super_endio(struct bio *bio, int error) +{ + struct cache *ca = bio->bi_private; + + bch_count_io_errors(ca, error, "writing superblock"); + closure_put(&ca->set->sb_write.cl); +} + +void bcache_write_super(struct cache_set *c) +{ + struct closure *cl = &c->sb_write.cl; + struct cache *ca; + unsigned i; + + closure_lock(&c->sb_write, &c->cl); + + c->sb.seq++; + + for_each_cache(ca, 
c, i) { + struct bio *bio = &ca->sb_bio; + + ca->sb.version = BCACHE_SB_VERSION; + ca->sb.seq = c->sb.seq; + ca->sb.last_mount = c->sb.last_mount; + + SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb)); + + bio_reset(bio); + bio->bi_bdev = ca->bdev; + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + + closure_get(cl); + __write_super(&ca->sb, bio); + } + + closure_return(cl); +} + +/* UUID io */ + +static void uuid_endio(struct bio *bio, int error) +{ + struct closure *cl = bio->bi_private; + struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl); + + cache_set_err_on(error, c, "accessing uuids"); + bch_bbio_free(bio, c); + closure_put(cl); +} + +static void uuid_io(struct cache_set *c, unsigned long rw, + struct bkey *k, struct closure *parent) +{ + struct closure *cl = &c->uuid_write.cl; + struct uuid_entry *u; + unsigned i; + + BUG_ON(!parent); + closure_lock(&c->uuid_write, parent); + + for (i = 0; i < KEY_PTRS(k); i++) { + struct bio *bio = bch_bbio_alloc(c); + + bio->bi_rw = REQ_SYNC|REQ_META|rw; + bio->bi_size = KEY_SIZE(k) << 9; + + bio->bi_end_io = uuid_endio; + bio->bi_private = cl; + bio_map(bio, c->uuids); + + bch_submit_bbio(bio, c, k, i); + + if (!(rw & WRITE)) + break; + } + + pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", + pkey(&c->uuid_bucket)); + + for (u = c->uuids; u < c->uuids + c->nr_uuids; u++) + if (!is_zero(u->uuid, 16)) + pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u", + u - c->uuids, u->uuid, u->label, + u->first_reg, u->last_reg, u->invalidated); + + closure_return(cl); +} + +static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl) +{ + struct bkey *k = &j->uuid_bucket; + + if (__bch_ptr_invalid(c, 1, k)) + return "bad uuid pointer"; + + bkey_copy(&c->uuid_bucket, k); + uuid_io(c, READ_SYNC, k, cl); + + if (j->version < BCACHE_JSET_VERSION_UUIDv1) { + struct uuid_entry_v0 *u0 = (void *) c->uuids; + struct uuid_entry *u1 = (void *) c->uuids; + int i; + + closure_sync(cl); + + /* + * Since the new uuid entry is bigger than the old, we have to + * convert starting at the highest memory address and work down + * in order to do it in place + */ + + for (i = c->nr_uuids - 1; + i >= 0; + --i) { + memcpy(u1[i].uuid, u0[i].uuid, 16); + memcpy(u1[i].label, u0[i].label, 32); + + u1[i].first_reg = u0[i].first_reg; + u1[i].last_reg = u0[i].last_reg; + u1[i].invalidated = u0[i].invalidated; + + u1[i].flags = 0; + u1[i].sectors = 0; + } + } + + return NULL; +} + +static int __uuid_write(struct cache_set *c) +{ + BKEY_PADDED(key) k; + struct closure cl; + closure_init_stack(&cl); + + lockdep_assert_held(&bch_register_lock); + + if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl)) + return 1; + + SET_KEY_SIZE(&k.key, c->sb.bucket_size); + uuid_io(c, REQ_WRITE, &k.key, &cl); + closure_sync(&cl); + + bkey_copy(&c->uuid_bucket, &k.key); + __bkey_put(c, &k.key); + return 0; +} + +int bch_uuid_write(struct cache_set *c) +{ + int ret = __uuid_write(c); + + if (!ret) + bch_journal_meta(c, NULL); + + return ret; +} + +static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) +{ + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids; u++) + if (!memcmp(u->uuid, uuid, 16)) + return u; + + return NULL; +} + +static struct uuid_entry *uuid_find_empty(struct cache_set *c) +{ + static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + return uuid_find(c, zero_uuid); +} + +/* + * Bucket priorities/gens: + * + * For each bucket, we store on disk its + * 8 bit gen 
+ * 16 bit priority
+ *
+ * See alloc.c for an explanation of the gen. The priority is used to implement
+ * lru (and in the future other) cache replacement policies; for most purposes
+ * it's just an opaque integer.
+ *
+ * The gens and the priorities don't have a whole lot to do with each other, and
+ * it's actually the gens that must be written out at specific times - it's no
+ * big deal if the priorities don't get written, if we lose them we just reuse
+ * buckets in suboptimal order.
+ *
+ * On disk they're stored in a packed array, and in as many buckets as are
+ * required to fit them all. The buckets we use to store them form a list; the
+ * journal header points to the first bucket, the first bucket points to the
+ * second bucket, et cetera.
+ *
+ * This code is used by the allocation code; periodically (whenever it runs out
+ * of buckets to allocate from) the allocation code will invalidate some
+ * buckets, but it can't use those buckets until their new gens are safely on
+ * disk.
+ */
+
+static void prio_endio(struct bio *bio, int error)
+{
+	struct cache *ca = bio->bi_private;
+
+	cache_set_err_on(error, ca->set, "accessing priorities");
+	bch_bbio_free(bio, ca->set);
+	closure_put(&ca->prio);
+}
+
+static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
+{
+	struct closure *cl = &ca->prio;
+	struct bio *bio = bch_bbio_alloc(ca->set);
+
+	closure_init_stack(cl);
+
+	bio->bi_sector	= bucket * ca->sb.bucket_size;
+	bio->bi_bdev	= ca->bdev;
+	bio->bi_rw	= REQ_SYNC|REQ_META|rw;
+	bio->bi_size	= bucket_bytes(ca);
+
+	bio->bi_end_io	= prio_endio;
+	bio->bi_private = ca;
+	bio_map(bio, ca->disk_buckets);
+
+	closure_bio_submit(bio, &ca->prio, ca);
+	closure_sync(cl);
+}
+
+#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
+	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)
+
+void bch_prio_write(struct cache *ca)
+{
+	int i;
+	struct bucket *b;
+	struct closure cl;
+
+	closure_init_stack(&cl);
+
+	lockdep_assert_held(&ca->set->bucket_lock);
+
+	for (b = ca->buckets;
+	     b < ca->buckets + ca->sb.nbuckets; b++)
+		b->disk_gen = b->gen;
+
+	ca->disk_buckets->seq++;
+
+	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
+			&ca->meta_sectors_written);
+
+	pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
+		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
+	blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
+
+	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
+		long bucket;
+		struct prio_set *p = ca->disk_buckets;
+		struct bucket_disk *d = p->data, *end = d + prios_per_bucket(ca);
+
+		for (b = ca->buckets + i * prios_per_bucket(ca);
+		     b < ca->buckets + ca->sb.nbuckets && d < end;
+		     b++, d++) {
+			d->prio = cpu_to_le16(b->prio);
+			d->gen = b->gen;
+		}
+
+		p->next_bucket	= ca->prio_buckets[i + 1];
+		p->magic	= pset_magic(ca);
+		p->csum		= crc64(&p->magic, bucket_bytes(ca) - 8);
+
+		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
+		BUG_ON(bucket == -1);
+
+		mutex_unlock(&ca->set->bucket_lock);
+		prio_io(ca, bucket, REQ_WRITE);
+		mutex_lock(&ca->set->bucket_lock);
+
+		ca->prio_buckets[i] = bucket;
+		atomic_dec_bug(&ca->buckets[bucket].pin);
+	}
+
+	mutex_unlock(&ca->set->bucket_lock);
+
+	bch_journal_meta(ca->set, &cl);
+	closure_sync(&cl);
+
+	mutex_lock(&ca->set->bucket_lock);
+
+	ca->need_save_prio = 0;
+
+	/*
+	 * Don't want the old priorities to get garbage collected until after we
+	 * finish writing the new ones, and they're journalled
+	 */
+	for (i = 0; i < prio_buckets(ca); i++)
+		ca->prio_last_buckets[i] =
ca->prio_buckets[i]; +} + +static void prio_read(struct cache *ca, uint64_t bucket) +{ + struct prio_set *p = ca->disk_buckets; + struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d; + struct bucket *b; + unsigned bucket_nr = 0; + + for (b = ca->buckets; + b < ca->buckets + ca->sb.nbuckets; + b++, d++) { + if (d == end) { + ca->prio_buckets[bucket_nr] = bucket; + ca->prio_last_buckets[bucket_nr] = bucket; + bucket_nr++; + + prio_io(ca, bucket, READ_SYNC); + + if (p->csum != crc64(&p->magic, bucket_bytes(ca) - 8)) + pr_warn("bad csum reading priorities"); + + if (p->magic != pset_magic(ca)) + pr_warn("bad magic reading priorities"); + + bucket = p->next_bucket; + d = p->data; + } + + b->prio = le16_to_cpu(d->prio); + b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen; + } +} + +/* Bcache device */ + +static int open_dev(struct block_device *b, fmode_t mode) +{ + struct bcache_device *d = b->bd_disk->private_data; + if (atomic_read(&d->closing)) + return -ENXIO; + + closure_get(&d->cl); + return 0; +} + +static int release_dev(struct gendisk *b, fmode_t mode) +{ + struct bcache_device *d = b->private_data; + closure_put(&d->cl); + return 0; +} + +static int ioctl_dev(struct block_device *b, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct bcache_device *d = b->bd_disk->private_data; + return d->ioctl(d, mode, cmd, arg); +} + +static const struct block_device_operations bcache_ops = { + .open = open_dev, + .release = release_dev, + .ioctl = ioctl_dev, + .owner = THIS_MODULE, +}; + +void bcache_device_stop(struct bcache_device *d) +{ + if (!atomic_xchg(&d->closing, 1)) + closure_queue(&d->cl); +} + +static void bcache_device_detach(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + if (atomic_read(&d->detaching)) { + struct uuid_entry *u = d->c->uuids + d->id; + + SET_UUID_FLASH_ONLY(u, 0); + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32(get_seconds()); + bch_uuid_write(d->c); + + atomic_set(&d->detaching, 0); + } + + d->c->devices[d->id] = NULL; + closure_put(&d->c->caching); + d->c = NULL; +} + +static void bcache_device_attach(struct bcache_device *d, struct cache_set *c, + unsigned id) +{ + BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags)); + + d->id = id; + d->c = c; + c->devices[id] = d; + + closure_get(&c->caching); +} + +static void bcache_device_link(struct bcache_device *d, struct cache_set *c, + const char *name) +{ + snprintf(d->name, BCACHEDEVNAME_SIZE, + "%s%u", name, d->id); + + WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || + sysfs_create_link(&c->kobj, &d->kobj, d->name), + "Couldn't create device <-> cache set symlinks"); +} + +static void bcache_device_free(struct bcache_device *d) +{ + lockdep_assert_held(&bch_register_lock); + + pr_info("%s stopped", d->disk->disk_name); + + if (d->c) + bcache_device_detach(d); + + if (d->disk) + del_gendisk(d->disk); + if (d->disk && d->disk->queue) + blk_cleanup_queue(d->disk->queue); + if (d->disk) + put_disk(d->disk); + + bio_split_pool_free(&d->bio_split_hook); + if (d->unaligned_bvec) + mempool_destroy(d->unaligned_bvec); + if (d->bio_split) + bioset_free(d->bio_split); + + closure_debug_destroy(&d->cl); +} + +static int bcache_device_init(struct bcache_device *d, unsigned block_size) +{ + struct request_queue *q; + + if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(d->unaligned_bvec = mempool_create_kmalloc_pool(1, + sizeof(struct bio_vec) * BIO_MAX_PAGES)) || + bio_split_pool_init(&d->bio_split_hook)) + + return -ENOMEM; + + 
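+	/*
+	 * (Editorial gloss, not part of the original patch: the pool sizes
+	 * above are forward-progress reserves, not capacity limits.  A
+	 * mempool guarantees that many pre-allocated elements, e.g.
+	 *
+	 *	pool = mempool_create_kmalloc_pool(1, elem_size);
+	 *	p = mempool_alloc(pool, GFP_NOIO);	never fails, may sleep
+	 *
+	 * so under memory pressure I/O is throttled rather than failed.
+	 * The single error return above also relies on the teardown path,
+	 * bcache_device_free(), NULL-checking each member, so a partially
+	 * failed init needs no unwind code here.)
+	 */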
d->disk = alloc_disk(1); + if (!d->disk) + return -ENOMEM; + + snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor); + + d->disk->major = bcache_major; + d->disk->first_minor = bcache_minor++; + d->disk->fops = &bcache_ops; + d->disk->private_data = d; + + q = blk_alloc_queue(GFP_KERNEL); + if (!q) + return -ENOMEM; + + blk_queue_make_request(q, NULL); + d->disk->queue = q; + q->queuedata = d; + q->backing_dev_info.congested_data = d; + q->limits.max_hw_sectors = UINT_MAX; + q->limits.max_sectors = UINT_MAX; + q->limits.max_segment_size = UINT_MAX; + q->limits.max_segments = BIO_MAX_PAGES; + q->limits.max_discard_sectors = UINT_MAX; + q->limits.io_min = block_size; + q->limits.logical_block_size = block_size; + q->limits.physical_block_size = block_size; + set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags); + set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags); + + return 0; +} + +/* Cached device */ + +static void calc_cached_dev_sectors(struct cache_set *c) +{ + uint64_t sectors = 0; + struct cached_dev *dc; + + list_for_each_entry(dc, &c->cached_devs, list) + sectors += bdev_sectors(dc->bdev); + + c->cached_dev_sectors = sectors; +} + +void bch_cached_dev_run(struct cached_dev *dc) +{ + struct bcache_device *d = &dc->disk; + + if (atomic_xchg(&dc->running, 1)) + return; + + if (!d->c && + BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { + struct closure cl; + closure_init_stack(&cl); + + SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE); + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } + + add_disk(d->disk); +#if 0 + char *env[] = { "SYMLINK=label" , NULL }; + kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env); +#endif + if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") || + sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache")) + pr_debug("error creating sysfs link"); +} + +static void cached_dev_detach_finish(struct work_struct *w) +{ + struct cached_dev *dc = container_of(w, struct cached_dev, detach); + char buf[BDEVNAME_SIZE]; + struct closure cl; + closure_init_stack(&cl); + + BUG_ON(!atomic_read(&dc->disk.detaching)); + BUG_ON(atomic_read(&dc->count)); + + sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name); + sysfs_remove_link(&dc->disk.kobj, "cache"); + + mutex_lock(&bch_register_lock); + + memset(&dc->sb.set_uuid, 0, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + + bcache_device_detach(&dc->disk); + list_move(&dc->list, &uncached_devices); + + mutex_unlock(&bch_register_lock); + + pr_info("Caching disabled for %s", bdevname(dc->bdev, buf)); + + /* Drop ref we took in cached_dev_detach() */ + closure_put(&dc->disk.cl); +} + +void bch_cached_dev_detach(struct cached_dev *dc) +{ + lockdep_assert_held(&bch_register_lock); + + if (atomic_read(&dc->disk.closing)) + return; + + if (atomic_xchg(&dc->disk.detaching, 1)) + return; + + /* + * Block the device from being closed and freed until we're finished + * detaching + */ + closure_get(&dc->disk.cl); + + bch_writeback_queue(dc); + cached_dev_put(dc); +} + +int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) +{ + uint32_t rtime = cpu_to_le32(get_seconds()); + struct uuid_entry *u; + char buf[BDEVNAME_SIZE]; + + bdevname(dc->bdev, buf); + + if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)) + return -ENOENT; + + if (dc->disk.c) { + pr_err("Can't attach %s: already attached", buf); + return -EINVAL; + } + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) { + pr_err("Can't attach %s: shutting 
down", buf); + return -EINVAL; + } + + if (dc->sb.block_size < c->sb.block_size) { + /* Will die */ + pr_err("Couldn't attach %s: block size " + "less than set's block size", buf); + return -EINVAL; + } + + u = uuid_find(c, dc->sb.uuid); + + if (u && + (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE || + BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) { + memcpy(u->uuid, invalid_uuid, 16); + u->invalidated = cpu_to_le32(get_seconds()); + u = NULL; + } + + if (!u) { + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + pr_err("Couldn't find uuid for %s in set", buf); + return -ENOENT; + } + + u = uuid_find_empty(c); + if (!u) { + pr_err("Not caching %s, no room for UUID", buf); + return -EINVAL; + } + } + + /* Deadlocks since we're called via sysfs... + sysfs_remove_file(&dc->kobj, &sysfs_attach); + */ + + if (is_zero(u->uuid, 16)) { + struct closure cl; + closure_init_stack(&cl); + + memcpy(u->uuid, dc->sb.uuid, 16); + memcpy(u->label, dc->sb.label, SB_LABEL_SIZE); + u->first_reg = u->last_reg = rtime; + bch_uuid_write(c); + + memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16); + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + + bch_write_bdev_super(dc, &cl); + closure_sync(&cl); + } else { + u->last_reg = rtime; + bch_uuid_write(c); + } + + bcache_device_attach(&dc->disk, c, u - c->uuids); + bcache_device_link(&dc->disk, c, "bdev"); + list_move(&dc->list, &c->cached_devs); + calc_cached_dev_sectors(c); + + smp_wmb(); + /* + * dc->c must be set before dc->count != 0 - paired with the mb in + * cached_dev_get() + */ + atomic_set(&dc->count, 1); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { + atomic_set(&dc->has_dirty, 1); + atomic_inc(&dc->count); + bch_writeback_queue(dc); + } + + bch_cached_dev_run(dc); + + pr_info("Caching %s as %s on set %pU", + bdevname(dc->bdev, buf), dc->disk.disk->disk_name, + dc->disk.c->sb.set_uuid); + return 0; +} + +void bch_cached_dev_release(struct kobject *kobj) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + kfree(dc); + module_put(THIS_MODULE); +} + +static void cached_dev_free(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + + cancel_delayed_work_sync(&dc->writeback_rate_update); + + mutex_lock(&bch_register_lock); + + bcache_device_free(&dc->disk); + list_del(&dc->list); + + mutex_unlock(&bch_register_lock); + + if (!IS_ERR_OR_NULL(dc->bdev)) { + blk_sync_queue(bdev_get_queue(dc->bdev)); + blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + } + + wake_up(&unregister_wait); + + kobject_put(&dc->disk.kobj); +} + +static void cached_dev_flush(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl); + struct bcache_device *d = &dc->disk; + + bch_cache_accounting_destroy(&dc->accounting); + kobject_del(&d->kobj); + + continue_at(cl, cached_dev_free, system_wq); +} + +static int cached_dev_init(struct cached_dev *dc, unsigned block_size) +{ + int err; + struct io *io; + + closure_init(&dc->disk.cl, NULL); + set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq); + + __module_get(THIS_MODULE); + INIT_LIST_HEAD(&dc->list); + kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype); + + bch_cache_accounting_init(&dc->accounting, &dc->disk.cl); + + err = bcache_device_init(&dc->disk, block_size); + if (err) + goto err; + + spin_lock_init(&dc->io_lock); + closure_init_unlocked(&dc->sb_write); + INIT_WORK(&dc->detach, cached_dev_detach_finish); + + dc->sequential_merge = true; + dc->sequential_cutoff = 4 << 20; + + INIT_LIST_HEAD(&dc->io_lru); + 
dc->sb_bio.bi_max_vecs = 1; + dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs; + + for (io = dc->io; io < dc->io + RECENT_IO; io++) { + list_add(&io->lru, &dc->io_lru); + hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); + } + + bch_writeback_init_cached_dev(dc); + return 0; +err: + bcache_device_stop(&dc->disk); + return err; +} + +/* Cached device - bcache superblock */ + +static const char *register_bdev(struct cache_sb *sb, struct page *sb_page, + struct block_device *bdev, + struct cached_dev *dc) +{ + char name[BDEVNAME_SIZE]; + const char *err = "cannot allocate memory"; + struct gendisk *g; + struct cache_set *c; + + if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0) + return err; + + memcpy(&dc->sb, sb, sizeof(struct cache_sb)); + dc->sb_bio.bi_io_vec[0].bv_page = sb_page; + dc->bdev = bdev; + dc->bdev->bd_holder = dc; + + g = dc->disk.disk; + + set_capacity(g, dc->bdev->bd_part->nr_sects - 16); + + bch_cached_dev_request_init(dc); + + err = "error creating kobject"; + if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj, + "bcache")) + goto err; + if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj)) + goto err; + + list_add(&dc->list, &uncached_devices); + list_for_each_entry(c, &bch_cache_sets, list) + bch_cached_dev_attach(dc, c); + + if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE || + BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) + bch_cached_dev_run(dc); + + return NULL; +err: + kobject_put(&dc->disk.kobj); + pr_notice("error opening %s: %s", bdevname(bdev, name), err); + /* + * Return NULL instead of an error because kobject_put() cleans + * everything up + */ + return NULL; +} + +/* Flash only volumes */ + +void bch_flash_dev_release(struct kobject *kobj) +{ + struct bcache_device *d = container_of(kobj, struct bcache_device, + kobj); + kfree(d); +} + +static void flash_dev_free(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + bcache_device_free(d); + kobject_put(&d->kobj); +} + +static void flash_dev_flush(struct closure *cl) +{ + struct bcache_device *d = container_of(cl, struct bcache_device, cl); + + sysfs_remove_link(&d->c->kobj, d->name); + sysfs_remove_link(&d->kobj, "cache"); + kobject_del(&d->kobj); + continue_at(cl, flash_dev_free, system_wq); +} + +static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) +{ + struct bcache_device *d = kzalloc(sizeof(struct bcache_device), + GFP_KERNEL); + if (!d) + return -ENOMEM; + + closure_init(&d->cl, NULL); + set_closure_fn(&d->cl, flash_dev_flush, system_wq); + + kobject_init(&d->kobj, &bch_flash_dev_ktype); + + if (bcache_device_init(d, block_bytes(c))) + goto err; + + bcache_device_attach(d, c, u - c->uuids); + set_capacity(d->disk, u->sectors); + bch_flash_dev_request_init(d); + add_disk(d->disk); + + if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache")) + goto err; + + bcache_device_link(d, c, "volume"); + + return 0; +err: + kobject_put(&d->kobj); + return -ENOMEM; +} + +static int flash_devs_run(struct cache_set *c) +{ + int ret = 0; + struct uuid_entry *u; + + for (u = c->uuids; + u < c->uuids + c->nr_uuids && !ret; + u++) + if (UUID_FLASH_ONLY(u)) + ret = flash_dev_run(c, u); + + return ret; +} + +int bch_flash_dev_create(struct cache_set *c, uint64_t size) +{ + struct uuid_entry *u; + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) + return -EINTR; + + u = uuid_find_empty(c); + if (!u) { + pr_err("Can't create volume, no room for UUID"); + return -EINVAL; + } + + get_random_bytes(u->uuid, 16); + memset(u->label, 0, 
32); + u->first_reg = u->last_reg = cpu_to_le32(get_seconds()); + + SET_UUID_FLASH_ONLY(u, 1); + u->sectors = size >> 9; + + bch_uuid_write(c); + + return flash_dev_run(c, u); +} + +/* Cache set */ + +__printf(2, 3) +bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...) +{ + va_list args; + + if (test_bit(CACHE_SET_STOPPING, &c->flags)) + return false; + + /* XXX: we can be called from atomic context + acquire_console_sem(); + */ + + printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid); + + va_start(args, fmt); + vprintk(fmt, args); + va_end(args); + + printk(", disabling caching\n"); + + bch_cache_set_unregister(c); + return true; +} + +void bch_cache_set_release(struct kobject *kobj) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + kfree(c); + module_put(THIS_MODULE); +} + +static void cache_set_free(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, cl); + struct cache *ca; + unsigned i; + + if (!IS_ERR_OR_NULL(c->debug)) + debugfs_remove(c->debug); + + bch_open_buckets_free(c); + bch_btree_cache_free(c); + bch_journal_free(c); + + for_each_cache(ca, c, i) + if (ca) + kobject_put(&ca->kobj); + + free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c))); + free_pages((unsigned long) c->sort, ilog2(bucket_pages(c))); + + kfree(c->fill_iter); + if (c->bio_split) + bioset_free(c->bio_split); + if (c->bio_meta) + mempool_destroy(c->bio_meta); + if (c->search) + mempool_destroy(c->search); + kfree(c->devices); + + mutex_lock(&bch_register_lock); + list_del(&c->list); + mutex_unlock(&bch_register_lock); + + pr_info("Cache set %pU unregistered", c->sb.set_uuid); + wake_up(&unregister_wait); + + closure_debug_destroy(&c->cl); + kobject_put(&c->kobj); +} + +static void cache_set_flush(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct btree *b; + + /* Shut down allocator threads */ + set_bit(CACHE_SET_STOPPING_2, &c->flags); + wake_up(&c->alloc_wait); + + bch_cache_accounting_destroy(&c->accounting); + + kobject_put(&c->internal); + kobject_del(&c->kobj); + + if (!IS_ERR_OR_NULL(c->root)) + list_add(&c->root->list, &c->btree_cache); + + /* Should skip this if we're unregistering because of an error */ + list_for_each_entry(b, &c->btree_cache, list) + if (btree_node_dirty(b)) + bch_btree_write(b, true, NULL); + + closure_return(cl); +} + +static void __cache_set_unregister(struct closure *cl) +{ + struct cache_set *c = container_of(cl, struct cache_set, caching); + struct cached_dev *dc, *t; + size_t i; + + mutex_lock(&bch_register_lock); + + if (test_bit(CACHE_SET_UNREGISTERING, &c->flags)) + list_for_each_entry_safe(dc, t, &c->cached_devs, list) + bch_cached_dev_detach(dc); + + for (i = 0; i < c->nr_uuids; i++) + if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i])) + bcache_device_stop(c->devices[i]); + + mutex_unlock(&bch_register_lock); + + continue_at(cl, cache_set_flush, system_wq); +} + +void bch_cache_set_stop(struct cache_set *c) +{ + if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags)) + closure_queue(&c->caching); +} + +void bch_cache_set_unregister(struct cache_set *c) +{ + set_bit(CACHE_SET_UNREGISTERING, &c->flags); + bch_cache_set_stop(c); +} + +#define alloc_bucket_pages(gfp, c) \ + ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c)))) + +struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) +{ + int iter_size; + struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL); + if (!c) + return NULL; + + __module_get(THIS_MODULE); + 
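+	/*
+	 * (Editorial sketch, not from the patch: two stacked closures are
+	 * initialized below.  c->cl is the cache set's lifetime reference,
+	 * whose final put runs cache_set_free(); c->caching is its child
+	 * and runs __cache_set_unregister() first.  In the closure API a
+	 * child pins its parent:
+	 *
+	 *	closure_init(&child, &parent);	child holds a ref on parent
+	 *	...
+	 *	closure_return(&child);		done; ref on parent dropped
+	 *
+	 * which guarantees unregister always finishes before the final
+	 * free.)
+	 */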
closure_init(&c->cl, NULL); + set_closure_fn(&c->cl, cache_set_free, system_wq); + + closure_init(&c->caching, &c->cl); + set_closure_fn(&c->caching, __cache_set_unregister, system_wq); + + /* Maybe create continue_at_noreturn() and use it here? */ + closure_set_stopped(&c->cl); + closure_put(&c->cl); + + kobject_init(&c->kobj, &bch_cache_set_ktype); + kobject_init(&c->internal, &bch_cache_set_internal_ktype); + + bch_cache_accounting_init(&c->accounting, &c->cl); + + memcpy(c->sb.set_uuid, sb->set_uuid, 16); + c->sb.block_size = sb->block_size; + c->sb.bucket_size = sb->bucket_size; + c->sb.nr_in_set = sb->nr_in_set; + c->sb.last_mount = sb->last_mount; + c->bucket_bits = ilog2(sb->bucket_size); + c->block_bits = ilog2(sb->block_size); + c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry); + + c->btree_pages = c->sb.bucket_size / PAGE_SECTORS; + if (c->btree_pages > BTREE_MAX_PAGES) + c->btree_pages = max_t(int, c->btree_pages / 4, + BTREE_MAX_PAGES); + + init_waitqueue_head(&c->alloc_wait); + mutex_init(&c->bucket_lock); + mutex_init(&c->fill_lock); + mutex_init(&c->sort_lock); + spin_lock_init(&c->sort_time_lock); + closure_init_unlocked(&c->sb_write); + closure_init_unlocked(&c->uuid_write); + spin_lock_init(&c->btree_read_time_lock); + bch_moving_init_cache_set(c); + + INIT_LIST_HEAD(&c->list); + INIT_LIST_HEAD(&c->cached_devs); + INIT_LIST_HEAD(&c->btree_cache); + INIT_LIST_HEAD(&c->btree_cache_freeable); + INIT_LIST_HEAD(&c->btree_cache_freed); + INIT_LIST_HEAD(&c->data_buckets); + + c->search = mempool_create_slab_pool(32, bch_search_cache); + if (!c->search) + goto err; + + iter_size = (sb->bucket_size / sb->block_size + 1) * + sizeof(struct btree_iter_set); + + if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) || + !(c->bio_meta = mempool_create_kmalloc_pool(2, + sizeof(struct bbio) + sizeof(struct bio_vec) * + bucket_pages(c))) || + !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) || + !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) || + !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) || + !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) || + bch_journal_alloc(c) || + bch_btree_cache_alloc(c) || + bch_open_buckets_alloc(c)) + goto err; + + c->fill_iter->size = sb->bucket_size / sb->block_size; + + c->congested_read_threshold_us = 2000; + c->congested_write_threshold_us = 20000; + c->error_limit = 8 << IO_ERROR_SHIFT; + + return c; +err: + bch_cache_set_unregister(c); + return NULL; +} + +static void run_cache_set(struct cache_set *c) +{ + const char *err = "cannot allocate memory"; + struct cached_dev *dc, *t; + struct cache *ca; + unsigned i; + + struct btree_op op; + bch_btree_op_init_stack(&op); + op.lock = SHRT_MAX; + + for_each_cache(ca, c, i) + c->nbuckets += ca->sb.nbuckets; + + if (CACHE_SYNC(&c->sb)) { + LIST_HEAD(journal); + struct bkey *k; + struct jset *j; + + err = "cannot allocate memory for journal"; + if (bch_journal_read(c, &journal, &op)) + goto err; + + pr_debug("btree_journal_read() done"); + + err = "no journal entries found"; + if (list_empty(&journal)) + goto err; + + j = &list_entry(journal.prev, struct journal_replay, list)->j; + + err = "IO error reading priorities"; + for_each_cache(ca, c, i) + prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]); + + /* + * If prio_read() fails it'll call cache_set_error and we'll + * tear everything down right away, but if we perhaps checked + * sooner we could avoid journal replay. 
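+		 *
+		 * (Editorial addition, one possible shape of such a check;
+		 * prio_read() would have to return an error rather than
+		 * only flagging it via cache_set_error():
+		 *
+		 *	err = "IO error reading priorities";
+		 *	for_each_cache(ca, c, i)
+		 *		if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
+		 *			goto err;
+		 *
+		 * Later kernels reworked prio_read() along roughly these
+		 * lines.)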
+	 */
+
+		k = &j->btree_root;
+
+		err = "bad btree root";
+		if (__bch_ptr_invalid(c, j->btree_level + 1, k))
+			goto err;
+
+		err = "error reading btree root";
+		c->root = bch_btree_node_get(c, k, j->btree_level, &op);
+		if (IS_ERR_OR_NULL(c->root))
+			goto err;
+
+		list_del_init(&c->root->list);
+		rw_unlock(true, c->root);
+
+		err = uuid_read(c, j, &op.cl);
+		if (err)
+			goto err;
+
+		err = "error in recovery";
+		if (bch_btree_check(c, &op))
+			goto err;
+
+		bch_journal_mark(c, &journal);
+		bch_btree_gc_finish(c);
+		pr_debug("btree_check() done");
+
+		/*
+		 * bcache_journal_next() can't happen sooner, or
+		 * btree_gc_finish() will give spurious errors about last_gc >
+		 * gc_gen - this is a hack but oh well.
+		 */
+		bch_journal_next(&c->journal);
+
+		for_each_cache(ca, c, i)
+			closure_call(&ca->alloc, bch_allocator_thread,
+				     system_wq, &c->cl);
+
+		/*
+		 * First place it's safe to allocate: btree_check() and
+		 * btree_gc_finish() have to run before we have buckets to
+		 * allocate, and bch_bucket_alloc_set() might cause a journal
+		 * entry to be written so bcache_journal_next() has to be called
+		 * first.
+		 *
+		 * If the uuids were in the old format we have to rewrite them
+		 * before the next journal entry is written:
+		 */
+		if (j->version < BCACHE_JSET_VERSION_UUID)
+			__uuid_write(c);
+
+		bch_journal_replay(c, &journal, &op);
+	} else {
+		pr_notice("invalidating existing data");
+		/* Don't want invalidate_buckets() to queue a gc yet */
+		closure_lock(&c->gc, NULL);
+
+		for_each_cache(ca, c, i) {
+			unsigned j;
+
+			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
+					      2, SB_JOURNAL_BUCKETS);
+
+			for (j = 0; j < ca->sb.keys; j++)
+				ca->sb.d[j] = ca->sb.first_bucket + j;
+		}
+
+		bch_btree_gc_finish(c);
+
+		for_each_cache(ca, c, i)
+			closure_call(&ca->alloc, bch_allocator_thread,
+				     ca->alloc_workqueue, &c->cl);
+
+		mutex_lock(&c->bucket_lock);
+		for_each_cache(ca, c, i)
+			bch_prio_write(ca);
+		mutex_unlock(&c->bucket_lock);
+
+		wake_up(&c->alloc_wait);
+
+		err = "cannot allocate new UUID bucket";
+		if (__uuid_write(c))
+			goto err_unlock_gc;
+
+		err = "cannot allocate new btree root";
+		c->root = bch_btree_node_alloc(c, 0, &op.cl);
+		if (IS_ERR_OR_NULL(c->root))
+			goto err_unlock_gc;
+
+		bkey_copy_key(&c->root->key, &MAX_KEY);
+		bch_btree_write(c->root, true, &op);
+
+		bch_btree_set_root(c->root);
+		rw_unlock(true, c->root);
+
+		/*
+		 * We don't want to write the first journal entry until
+		 * everything is set up - fortunately journal entries won't be
+		 * written until the SET_CACHE_SYNC() here:
+		 */
+		SET_CACHE_SYNC(&c->sb, true);
+
+		bch_journal_next(&c->journal);
+		bch_journal_meta(c, &op.cl);
+
+		/* Unlock */
+		closure_set_stopped(&c->gc.cl);
+		closure_put(&c->gc.cl);
+	}
+
+	closure_sync(&op.cl);
+	c->sb.last_mount = get_seconds();
+	bcache_write_super(c);
+
+	list_for_each_entry_safe(dc, t, &uncached_devices, list)
+		bch_cached_dev_attach(dc, c);
+
+	flash_devs_run(c);
+
+	return;
+err_unlock_gc:
+	closure_set_stopped(&c->gc.cl);
+	closure_put(&c->gc.cl);
+err:
+	closure_sync(&op.cl);
+	/* XXX: test this, it's broken */
+	bch_cache_set_error(c, err);
+}
+
+static bool can_attach_cache(struct cache *ca, struct cache_set *c)
+{
+	return ca->sb.block_size	== c->sb.block_size &&
+		ca->sb.bucket_size	== c->sb.bucket_size &&
+		ca->sb.nr_in_set	== c->sb.nr_in_set;
+}
+
+static const char *register_cache_set(struct cache *ca)
+{
+	char buf[12];
+	const char *err = "cannot allocate memory";
+	struct cache_set *c;
+
+	list_for_each_entry(c, &bch_cache_sets, list)
+		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid,
16)) { + if (c->cache[ca->sb.nr_this_dev]) + return "duplicate cache set member"; + + if (!can_attach_cache(ca, c)) + return "cache sb does not match set"; + + if (!CACHE_SYNC(&ca->sb)) + SET_CACHE_SYNC(&c->sb, false); + + goto found; + } + + c = bch_cache_set_alloc(&ca->sb); + if (!c) + return err; + + err = "error creating kobject"; + if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) || + kobject_add(&c->internal, &c->kobj, "internal")) + goto err; + + if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj)) + goto err; + + bch_debug_init_cache_set(c); + + list_add(&c->list, &bch_cache_sets); +found: + sprintf(buf, "cache%i", ca->sb.nr_this_dev); + if (sysfs_create_link(&ca->kobj, &c->kobj, "set") || + sysfs_create_link(&c->kobj, &ca->kobj, buf)) + goto err; + + if (ca->sb.seq > c->sb.seq) { + c->sb.version = ca->sb.version; + memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16); + c->sb.flags = ca->sb.flags; + c->sb.seq = ca->sb.seq; + pr_debug("set version = %llu", c->sb.version); + } + + ca->set = c; + ca->set->cache[ca->sb.nr_this_dev] = ca; + c->cache_by_alloc[c->caches_loaded++] = ca; + + if (c->caches_loaded == c->sb.nr_in_set) + run_cache_set(c); + + return NULL; +err: + bch_cache_set_unregister(c); + return err; +} + +/* Cache device */ + +void bch_cache_release(struct kobject *kobj) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + if (ca->set) + ca->set->cache[ca->sb.nr_this_dev] = NULL; + + bch_cache_allocator_exit(ca); + + bio_split_pool_free(&ca->bio_split_hook); + + if (ca->alloc_workqueue) + destroy_workqueue(ca->alloc_workqueue); + + free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca))); + kfree(ca->prio_buckets); + vfree(ca->buckets); + + free_heap(&ca->heap); + free_fifo(&ca->unused); + free_fifo(&ca->free_inc); + free_fifo(&ca->free); + + if (ca->sb_bio.bi_inline_vecs[0].bv_page) + put_page(ca->sb_bio.bi_io_vec[0].bv_page); + + if (!IS_ERR_OR_NULL(ca->bdev)) { + blk_sync_queue(bdev_get_queue(ca->bdev)); + blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); + } + + kfree(ca); + module_put(THIS_MODULE); +} + +static int cache_alloc(struct cache_sb *sb, struct cache *ca) +{ + size_t free; + struct bucket *b; + + if (!ca) + return -ENOMEM; + + __module_get(THIS_MODULE); + kobject_init(&ca->kobj, &bch_cache_ktype); + + memcpy(&ca->sb, sb, sizeof(struct cache_sb)); + + INIT_LIST_HEAD(&ca->discards); + + bio_init(&ca->sb_bio); + ca->sb_bio.bi_max_vecs = 1; + ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs; + + bio_init(&ca->journal.bio); + ca->journal.bio.bi_max_vecs = 8; + ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs; + + free = roundup_pow_of_two(ca->sb.nbuckets) >> 9; + free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2); + + if (!init_fifo(&ca->free, free, GFP_KERNEL) || + !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) || + !init_fifo(&ca->unused, free << 2, GFP_KERNEL) || + !init_heap(&ca->heap, free << 3, GFP_KERNEL) || + !(ca->buckets = vmalloc(sizeof(struct bucket) * + ca->sb.nbuckets)) || + !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) * + 2, GFP_KERNEL)) || + !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) || + !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) || + bio_split_pool_init(&ca->bio_split_hook)) + goto err; + + ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); + + memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket)); + for_each_bucket(b, ca) + atomic_set(&b->pin, 0); + + if (bch_cache_allocator_init(ca)) + goto err; + + 
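+	/*
+	 * (Worked example added in editing, with assumed numbers: for a
+	 * device with ca->sb.nbuckets = 131072 and prio_buckets(ca) well
+	 * under 120 - so the power-of-two term dominates - the sizing
+	 * above gives
+	 *
+	 *	free     = roundup_pow_of_two(131072) >> 9	=  256
+	 *	free_inc = free << 2				= 1024
+	 *	unused   = free << 2				= 1024
+	 *	heap     = free << 3				= 2048
+	 *
+	 * entries: the fifos scale with the bucket count so the allocator
+	 * keeps a cushion of invalidated buckets ready to hand out.)
+	 */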
return 0; +err: + kobject_put(&ca->kobj); + return -ENOMEM; +} + +static const char *register_cache(struct cache_sb *sb, struct page *sb_page, + struct block_device *bdev, struct cache *ca) +{ + char name[BDEVNAME_SIZE]; + const char *err = "cannot allocate memory"; + + if (cache_alloc(sb, ca) != 0) + return err; + + ca->sb_bio.bi_io_vec[0].bv_page = sb_page; + ca->bdev = bdev; + ca->bdev->bd_holder = ca; + + if (blk_queue_discard(bdev_get_queue(ca->bdev))) + ca->discard = CACHE_DISCARD(&ca->sb); + + err = "error creating kobject"; + if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) + goto err; + + err = register_cache_set(ca); + if (err) + goto err; + + pr_info("registered cache device %s", bdevname(bdev, name)); + + return NULL; +err: + kobject_put(&ca->kobj); + pr_info("error opening %s: %s", bdevname(bdev, name), err); + /* Return NULL instead of an error because kobject_put() cleans + * everything up + */ + return NULL; +} + +/* Global interfaces/init */ + +static ssize_t register_bcache(struct kobject *, struct kobj_attribute *, + const char *, size_t); + +kobj_attribute_write(register, register_bcache); +kobj_attribute_write(register_quiet, register_bcache); + +static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, + const char *buffer, size_t size) +{ + ssize_t ret = size; + const char *err = "cannot allocate memory"; + char *path = NULL; + struct cache_sb *sb = NULL; + struct block_device *bdev = NULL; + struct page *sb_page = NULL; + + if (!try_module_get(THIS_MODULE)) + return -EBUSY; + + mutex_lock(&bch_register_lock); + + if (!(path = kstrndup(buffer, size, GFP_KERNEL)) || + !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL))) + goto err; + + err = "failed to open device"; + bdev = blkdev_get_by_path(strim(path), + FMODE_READ|FMODE_WRITE|FMODE_EXCL, + sb); + if (bdev == ERR_PTR(-EBUSY)) + err = "device busy"; + + if (IS_ERR(bdev) || + set_blocksize(bdev, 4096)) + goto err; + + err = read_super(sb, bdev, &sb_page); + if (err) + goto err_close; + + if (sb->version == CACHE_BACKING_DEV) { + struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL); + + err = register_bdev(sb, sb_page, bdev, dc); + } else { + struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + err = register_cache(sb, sb_page, bdev, ca); + } + + if (err) { + /* register_(bdev|cache) will only return an error if they + * didn't get far enough to create the kobject - if they did, + * the kobject destructor will do this cleanup. + */ + put_page(sb_page); +err_close: + blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); +err: + if (attr != &ksysfs_register_quiet) + pr_info("error opening %s: %s", path, err); + ret = -EINVAL; + } + + kfree(sb); + kfree(path); + mutex_unlock(&bch_register_lock); + module_put(THIS_MODULE); + return ret; +} + +static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x) +{ + if (code == SYS_DOWN || + code == SYS_HALT || + code == SYS_POWER_OFF) { + DEFINE_WAIT(wait); + unsigned long start = jiffies; + bool stopped = false; + + struct cache_set *c, *tc; + struct cached_dev *dc, *tdc; + + mutex_lock(&bch_register_lock); + + if (list_empty(&bch_cache_sets) && + list_empty(&uncached_devices)) + goto out; + + pr_info("Stopping all devices:"); + + list_for_each_entry_safe(c, tc, &bch_cache_sets, list) + bch_cache_set_stop(c); + + list_for_each_entry_safe(dc, tdc, &uncached_devices, list) + bcache_device_stop(&dc->disk); + + /* What's a condition variable? 
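+		 *
+		 * (Editorial gloss: the hand-rolled loop below is in effect
+		 *
+		 *	wait_event_timeout(unregister_wait,
+		 *			   list_empty(&bch_cache_sets) &&
+		 *			   list_empty(&uncached_devices),
+		 *			   2 * HZ);
+		 *
+		 * written out with prepare_to_wait()/schedule_timeout()
+		 * because bch_register_lock has to be dropped around the
+		 * sleep and the condition re-checked under it.)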
*/ + while (1) { + long timeout = start + 2 * HZ - jiffies; + + stopped = list_empty(&bch_cache_sets) && + list_empty(&uncached_devices); + + if (timeout < 0 || stopped) + break; + + prepare_to_wait(&unregister_wait, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&bch_register_lock); + schedule_timeout(timeout); + mutex_lock(&bch_register_lock); + } + + finish_wait(&unregister_wait, &wait); + + if (stopped) + pr_info("All devices stopped"); + else + pr_notice("Timeout waiting for devices to be closed"); +out: + mutex_unlock(&bch_register_lock); + } + + return NOTIFY_DONE; +} + +static struct notifier_block reboot = { + .notifier_call = bcache_reboot, + .priority = INT_MAX, /* before any real devices */ +}; + +static void bcache_exit(void) +{ + bch_debug_exit(); + bch_writeback_exit(); + bch_request_exit(); + bch_btree_exit(); + if (bcache_kobj) + kobject_put(bcache_kobj); + if (bcache_wq) + destroy_workqueue(bcache_wq); + unregister_blkdev(bcache_major, "bcache"); + unregister_reboot_notifier(&reboot); +} + +static int __init bcache_init(void) +{ + static const struct attribute *files[] = { + &ksysfs_register.attr, + &ksysfs_register_quiet.attr, + NULL + }; + + mutex_init(&bch_register_lock); + init_waitqueue_head(&unregister_wait); + register_reboot_notifier(&reboot); + + bcache_major = register_blkdev(0, "bcache"); + if (bcache_major < 0) + return bcache_major; + + if (!(bcache_wq = create_workqueue("bcache")) || + !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) || + sysfs_create_files(bcache_kobj, files) || + bch_btree_init() || + bch_request_init() || + bch_writeback_init() || + bch_debug_init(bcache_kobj)) + goto err; + + return 0; +err: + bcache_exit(); + return -ENOMEM; +} + +module_exit(bcache_exit); +module_init(bcache_init); diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c new file mode 100644 index 000000000000..5c7e77073b1f --- /dev/null +++ b/drivers/md/bcache/sysfs.c @@ -0,0 +1,817 @@ +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "sysfs.h" +#include "btree.h" +#include "request.h" + +#include + +static const char * const cache_replacement_policies[] = { + "lru", + "fifo", + "random", + NULL +}; + +write_attribute(attach); +write_attribute(detach); +write_attribute(unregister); +write_attribute(stop); +write_attribute(clear_stats); +write_attribute(trigger_gc); +write_attribute(prune_cache); +write_attribute(flash_vol_create); + +read_attribute(bucket_size); +read_attribute(block_size); +read_attribute(nbuckets); +read_attribute(tree_depth); +read_attribute(root_usage_percent); +read_attribute(priority_stats); +read_attribute(btree_cache_size); +read_attribute(btree_cache_max_chain); +read_attribute(cache_available_percent); +read_attribute(written); +read_attribute(btree_written); +read_attribute(metadata_written); +read_attribute(active_journal_entries); + +sysfs_time_stats_attribute(btree_gc, sec, ms); +sysfs_time_stats_attribute(btree_split, sec, us); +sysfs_time_stats_attribute(btree_sort, ms, us); +sysfs_time_stats_attribute(btree_read, ms, us); +sysfs_time_stats_attribute(try_harder, ms, us); + +read_attribute(btree_nodes); +read_attribute(btree_used_percent); +read_attribute(average_key_size); +read_attribute(dirty_data); +read_attribute(bset_tree_stats); + +read_attribute(state); +read_attribute(cache_read_races); +read_attribute(writeback_keys_done); +read_attribute(writeback_keys_failed); +read_attribute(io_errors); +read_attribute(congested); +rw_attribute(congested_read_threshold_us); +rw_attribute(congested_write_threshold_us); + +rw_attribute(sequential_cutoff); +rw_attribute(sequential_merge); +rw_attribute(data_csum); +rw_attribute(cache_mode); +rw_attribute(writeback_metadata); +rw_attribute(writeback_running); +rw_attribute(writeback_percent); +rw_attribute(writeback_delay); +rw_attribute(writeback_rate); + +rw_attribute(writeback_rate_update_seconds); +rw_attribute(writeback_rate_d_term); +rw_attribute(writeback_rate_p_term_inverse); +rw_attribute(writeback_rate_d_smooth); +read_attribute(writeback_rate_debug); + +rw_attribute(synchronous); +rw_attribute(journal_delay_ms); +rw_attribute(discard); +rw_attribute(running); +rw_attribute(label); +rw_attribute(readahead); +rw_attribute(io_error_limit); +rw_attribute(io_error_halflife); +rw_attribute(verify); +rw_attribute(key_merging_disabled); +rw_attribute(gc_always_rewrite); +rw_attribute(freelist_percent); +rw_attribute(cache_replacement_policy); +rw_attribute(btree_shrinker_disabled); +rw_attribute(copy_gc_enabled); +rw_attribute(size); + +SHOW(__bch_cached_dev) +{ + struct cached_dev *dc = container_of(kobj, struct cached_dev, + disk.kobj); + const char *states[] = { "no cache", "clean", "dirty", "inconsistent" }; + +#define var(stat) (dc->stat) + + if (attr == &sysfs_cache_mode) + return snprint_string_list(buf, PAGE_SIZE, + bch_cache_modes + 1, + BDEV_CACHE_MODE(&dc->sb)); + + sysfs_printf(data_csum, "%i", dc->disk.data_csum); + var_printf(verify, "%i"); + var_printf(writeback_metadata, "%i"); + var_printf(writeback_running, "%i"); + var_print(writeback_delay); + var_print(writeback_percent); + sysfs_print(writeback_rate, dc->writeback_rate.rate); + + var_print(writeback_rate_update_seconds); + var_print(writeback_rate_d_term); + var_print(writeback_rate_p_term_inverse); + var_print(writeback_rate_d_smooth); + + if (attr == &sysfs_writeback_rate_debug) { + char dirty[20]; + char derivative[20]; + char target[20]; + hprint(dirty, + atomic_long_read(&dc->disk.sectors_dirty) << 9); + hprint(derivative, 
dc->writeback_rate_derivative << 9);
+		hprint(target,		dc->writeback_rate_target << 9);
+
+		return sprintf(buf,
+			       "rate:\t\t%u\n"
+			       "change:\t\t%i\n"
+			       "dirty:\t\t%s\n"
+			       "derivative:\t%s\n"
+			       "target:\t\t%s\n",
+			       dc->writeback_rate.rate,
+			       dc->writeback_rate_change,
+			       dirty, derivative, target);
+	}
+
+	sysfs_hprint(dirty_data,
+		     atomic_long_read(&dc->disk.sectors_dirty) << 9);
+
+	var_printf(sequential_merge,	"%i");
+	var_hprint(sequential_cutoff);
+	var_hprint(readahead);
+
+	sysfs_print(running,		atomic_read(&dc->running));
+	sysfs_print(state,		states[BDEV_STATE(&dc->sb)]);
+
+	if (attr == &sysfs_label) {
+		memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+		strcat(buf, "\n");
+		return strlen(buf);
+	}
+
+#undef var
+	return 0;
+}
+SHOW_LOCKED(bch_cached_dev)
+
+STORE(__cached_dev)
+{
+	struct cached_dev *dc = container_of(kobj, struct cached_dev,
+					     disk.kobj);
+	unsigned v = size;
+	struct cache_set *c;
+
+#define d_strtoul(var)		sysfs_strtoul(var, dc->var)
+#define d_strtoi_h(var)		sysfs_hatoi(var, dc->var)
+
+	sysfs_strtoul(data_csum, dc->disk.data_csum);
+	d_strtoul(verify);
+	d_strtoul(writeback_metadata);
+	d_strtoul(writeback_running);
+	d_strtoul(writeback_delay);
+	sysfs_strtoul_clamp(writeback_rate,
+			    dc->writeback_rate.rate, 1, 1000000);
+	sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent, 0, 40);
+
+	d_strtoul(writeback_rate_update_seconds);
+	d_strtoul(writeback_rate_d_term);
+	sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
+			    dc->writeback_rate_p_term_inverse, 1, INT_MAX);
+	d_strtoul(writeback_rate_d_smooth);
+
+	d_strtoul(sequential_merge);
+	d_strtoi_h(sequential_cutoff);
+	d_strtoi_h(readahead);
+
+	if (attr == &sysfs_clear_stats)
+		bch_cache_accounting_clear(&dc->accounting);
+
+	if (attr == &sysfs_running &&
+	    strtoul_or_return(buf))
+		bch_cached_dev_run(dc);
+
+	if (attr == &sysfs_cache_mode) {
+		ssize_t v = read_string_list(buf, bch_cache_modes + 1);
+
+		if (v < 0)
+			return v;
+
+		if ((unsigned) v != BDEV_CACHE_MODE(&dc->sb)) {
+			SET_BDEV_CACHE_MODE(&dc->sb, v);
+			bch_write_bdev_super(dc, NULL);
+		}
+	}
+
+	if (attr == &sysfs_label) {
+		memcpy(dc->sb.label, buf, SB_LABEL_SIZE);
+		bch_write_bdev_super(dc, NULL);
+		if (dc->disk.c) {
+			memcpy(dc->disk.c->uuids[dc->disk.id].label,
+			       buf, SB_LABEL_SIZE);
+			bch_uuid_write(dc->disk.c);
+		}
+	}
+
+	if (attr == &sysfs_attach) {
+		if (parse_uuid(buf, dc->sb.set_uuid) < 16)
+			return -EINVAL;
+
+		list_for_each_entry(c, &bch_cache_sets, list) {
+			v = bch_cached_dev_attach(dc, c);
+			if (!v)
+				return size;
+		}
+
+		pr_err("Can't attach %s: cache set not found", buf);
+		size = v;
+	}
+
+	if (attr == &sysfs_detach && dc->disk.c)
+		bch_cached_dev_detach(dc);
+
+	if (attr == &sysfs_stop)
+		bcache_device_stop(&dc->disk);
+
+	return size;
+}
+
+STORE(bch_cached_dev)
+{
+	struct cached_dev *dc = container_of(kobj, struct cached_dev,
+					     disk.kobj);
+
+	mutex_lock(&bch_register_lock);
+	size = __cached_dev_store(kobj, attr, buf, size);
+
+	if (attr == &sysfs_writeback_running)
+		bch_writeback_queue(dc);
+
+	if (attr == &sysfs_writeback_percent)
+		schedule_delayed_work(&dc->writeback_rate_update,
+				      dc->writeback_rate_update_seconds * HZ);
+
+	mutex_unlock(&bch_register_lock);
+	return size;
+}
+
+static struct attribute *bch_cached_dev_files[] = {
+	&sysfs_attach,
+	&sysfs_detach,
+	&sysfs_stop,
+#if 0
+	&sysfs_data_csum,
+#endif
+	&sysfs_cache_mode,
+	&sysfs_writeback_metadata,
+	&sysfs_writeback_running,
+	&sysfs_writeback_delay,
+	&sysfs_writeback_percent,
+	&sysfs_writeback_rate,
+	&sysfs_writeback_rate_update_seconds,
+	&sysfs_writeback_rate_d_term,
+	&sysfs_writeback_rate_p_term_inverse,
+	&sysfs_writeback_rate_d_smooth,
+	&sysfs_writeback_rate_debug,
+	&sysfs_dirty_data,
+	&sysfs_sequential_cutoff,
+	&sysfs_sequential_merge,
+	&sysfs_clear_stats,
+	&sysfs_running,
+	&sysfs_state,
+	&sysfs_label,
+	&sysfs_readahead,
+#ifdef CONFIG_BCACHE_DEBUG
+	&sysfs_verify,
+#endif
+	NULL
+};
+KTYPE(bch_cached_dev);
+
+SHOW(bch_flash_dev)
+{
+	struct bcache_device *d = container_of(kobj, struct bcache_device,
+					       kobj);
+	struct uuid_entry *u = &d->c->uuids[d->id];
+
+	sysfs_printf(data_csum,	"%i", d->data_csum);
+	sysfs_hprint(size,	u->sectors << 9);
+
+	if (attr == &sysfs_label) {
+		memcpy(buf, u->label, SB_LABEL_SIZE);
+		buf[SB_LABEL_SIZE] = '\0';
+		strcat(buf, "\n");
+		return strlen(buf);
+	}
+
+	return 0;
+}
+
+STORE(__bch_flash_dev)
+{
+	struct bcache_device *d = container_of(kobj, struct bcache_device,
+					       kobj);
+	struct uuid_entry *u = &d->c->uuids[d->id];
+
+	sysfs_strtoul(data_csum, d->data_csum);
+
+	if (attr == &sysfs_size) {
+		uint64_t v;
+		strtoi_h_or_return(buf, v);
+
+		u->sectors = v >> 9;
+		bch_uuid_write(d->c);
+		set_capacity(d->disk, u->sectors);
+	}
+
+	if (attr == &sysfs_label) {
+		memcpy(u->label, buf, SB_LABEL_SIZE);
+		bch_uuid_write(d->c);
+	}
+
+	if (attr == &sysfs_unregister) {
+		atomic_set(&d->detaching, 1);
+		bcache_device_stop(d);
+	}
+
+	return size;
+}
+STORE_LOCKED(bch_flash_dev)
+
+static struct attribute *bch_flash_dev_files[] = {
+	&sysfs_unregister,
+#if 0
+	&sysfs_data_csum,
+#endif
+	&sysfs_label,
+	&sysfs_size,
+	NULL
+};
+KTYPE(bch_flash_dev);
+
+SHOW(__bch_cache_set)
+{
+	unsigned root_usage(struct cache_set *c)
+	{
+		unsigned bytes = 0;
+		struct bkey *k;
+		struct btree *b;
+		struct btree_iter iter;
+
+		goto lock_root;
+
+		do {
+			rw_unlock(false, b);
+lock_root:
+			b = c->root;
+			rw_lock(false, b, b->level);
+		} while (b != c->root);
+
+		for_each_key_filter(b, k, &iter, bch_ptr_bad)
+			bytes += bkey_bytes(k);
+
+		rw_unlock(false, b);
+
+		return (bytes * 100) / btree_bytes(c);
+	}
+
+	size_t cache_size(struct cache_set *c)
+	{
+		size_t ret = 0;
+		struct btree *b;
+
+		mutex_lock(&c->bucket_lock);
+		list_for_each_entry(b, &c->btree_cache, list)
+			ret += 1 << (b->page_order + PAGE_SHIFT);
+
+		mutex_unlock(&c->bucket_lock);
+		return ret;
+	}
+
+	unsigned cache_max_chain(struct cache_set *c)
+	{
+		unsigned ret = 0;
+		struct hlist_head *h;
+
+		mutex_lock(&c->bucket_lock);
+
+		for (h = c->bucket_hash;
+		     h < c->bucket_hash + (1 << BUCKET_HASH_BITS);
+		     h++) {
+			unsigned i = 0;
+			struct hlist_node *p;
+
+			hlist_for_each(p, h)
+				i++;
+
+			ret = max(ret, i);
+		}
+
+		mutex_unlock(&c->bucket_lock);
+		return ret;
+	}
+
+	unsigned btree_used(struct cache_set *c)
+	{
+		return div64_u64(c->gc_stats.key_bytes * 100,
+				 (c->gc_stats.nodes ?: 1) * btree_bytes(c));
+	}
+
+	unsigned average_key_size(struct cache_set *c)
+	{
+		return c->gc_stats.nkeys
+			?
div64_u64(c->gc_stats.data, c->gc_stats.nkeys) + : 0; + } + + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + sysfs_print(synchronous, CACHE_SYNC(&c->sb)); + sysfs_print(journal_delay_ms, c->journal_delay_ms); + sysfs_hprint(bucket_size, bucket_bytes(c)); + sysfs_hprint(block_size, block_bytes(c)); + sysfs_print(tree_depth, c->root->level); + sysfs_print(root_usage_percent, root_usage(c)); + + sysfs_hprint(btree_cache_size, cache_size(c)); + sysfs_print(btree_cache_max_chain, cache_max_chain(c)); + sysfs_print(cache_available_percent, 100 - c->gc_stats.in_use); + + sysfs_print_time_stats(&c->btree_gc_time, btree_gc, sec, ms); + sysfs_print_time_stats(&c->btree_split_time, btree_split, sec, us); + sysfs_print_time_stats(&c->sort_time, btree_sort, ms, us); + sysfs_print_time_stats(&c->btree_read_time, btree_read, ms, us); + sysfs_print_time_stats(&c->try_harder_time, try_harder, ms, us); + + sysfs_print(btree_used_percent, btree_used(c)); + sysfs_print(btree_nodes, c->gc_stats.nodes); + sysfs_hprint(dirty_data, c->gc_stats.dirty); + sysfs_hprint(average_key_size, average_key_size(c)); + + sysfs_print(cache_read_races, + atomic_long_read(&c->cache_read_races)); + + sysfs_print(writeback_keys_done, + atomic_long_read(&c->writeback_keys_done)); + sysfs_print(writeback_keys_failed, + atomic_long_read(&c->writeback_keys_failed)); + + /* See count_io_errors for why 88 */ + sysfs_print(io_error_halflife, c->error_decay * 88); + sysfs_print(io_error_limit, c->error_limit >> IO_ERROR_SHIFT); + + sysfs_hprint(congested, + ((uint64_t) bch_get_congested(c)) << 9); + sysfs_print(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_print(congested_write_threshold_us, + c->congested_write_threshold_us); + + sysfs_print(active_journal_entries, fifo_used(&c->journal.pin)); + sysfs_printf(verify, "%i", c->verify); + sysfs_printf(key_merging_disabled, "%i", c->key_merging_disabled); + sysfs_printf(gc_always_rewrite, "%i", c->gc_always_rewrite); + sysfs_printf(btree_shrinker_disabled, "%i", c->shrinker_disabled); + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + if (attr == &sysfs_bset_tree_stats) + return bch_bset_print_stats(c, buf); + + return 0; +} +SHOW_LOCKED(bch_cache_set) + +STORE(__bch_cache_set) +{ + struct cache_set *c = container_of(kobj, struct cache_set, kobj); + + if (attr == &sysfs_unregister) + bch_cache_set_unregister(c); + + if (attr == &sysfs_stop) + bch_cache_set_stop(c); + + if (attr == &sysfs_synchronous) { + bool sync = strtoul_or_return(buf); + + if (sync != CACHE_SYNC(&c->sb)) { + SET_CACHE_SYNC(&c->sb, sync); + bcache_write_super(c); + } + } + + if (attr == &sysfs_flash_vol_create) { + int r; + uint64_t v; + strtoi_h_or_return(buf, v); + + r = bch_flash_dev_create(c, v); + if (r) + return r; + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&c->writeback_keys_done, 0); + atomic_long_set(&c->writeback_keys_failed, 0); + + memset(&c->gc_stats, 0, sizeof(struct gc_stat)); + bch_cache_accounting_clear(&c->accounting); + } + + if (attr == &sysfs_trigger_gc) + bch_queue_gc(c); + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->shrink.shrink(&c->shrink, &sc); + } + + sysfs_strtoul(congested_read_threshold_us, + c->congested_read_threshold_us); + sysfs_strtoul(congested_write_threshold_us, + c->congested_write_threshold_us); + + if (attr == &sysfs_io_error_limit) + c->error_limit = strtoul_or_return(buf) << IO_ERROR_SHIFT; + + /* See 
count_io_errors() for why 88 */ + if (attr == &sysfs_io_error_halflife) + c->error_decay = strtoul_or_return(buf) / 88; + + sysfs_strtoul(journal_delay_ms, c->journal_delay_ms); + sysfs_strtoul(verify, c->verify); + sysfs_strtoul(key_merging_disabled, c->key_merging_disabled); + sysfs_strtoul(gc_always_rewrite, c->gc_always_rewrite); + sysfs_strtoul(btree_shrinker_disabled, c->shrinker_disabled); + sysfs_strtoul(copy_gc_enabled, c->copy_gc_enabled); + + return size; +} +STORE_LOCKED(bch_cache_set) + +SHOW(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_show(&c->kobj, attr, buf); +} + +STORE(bch_cache_set_internal) +{ + struct cache_set *c = container_of(kobj, struct cache_set, internal); + return bch_cache_set_store(&c->kobj, attr, buf, size); +} + +static void bch_cache_set_internal_release(struct kobject *k) +{ +} + +static struct attribute *bch_cache_set_files[] = { + &sysfs_unregister, + &sysfs_stop, + &sysfs_synchronous, + &sysfs_journal_delay_ms, + &sysfs_flash_vol_create, + + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_tree_depth, + &sysfs_root_usage_percent, + &sysfs_btree_cache_size, + &sysfs_cache_available_percent, + + &sysfs_average_key_size, + &sysfs_dirty_data, + + &sysfs_io_error_limit, + &sysfs_io_error_halflife, + &sysfs_congested, + &sysfs_congested_read_threshold_us, + &sysfs_congested_write_threshold_us, + &sysfs_clear_stats, + NULL +}; +KTYPE(bch_cache_set); + +static struct attribute *bch_cache_set_internal_files[] = { + &sysfs_active_journal_entries, + + sysfs_time_stats_attribute_list(btree_gc, sec, ms) + sysfs_time_stats_attribute_list(btree_split, sec, us) + sysfs_time_stats_attribute_list(btree_sort, ms, us) + sysfs_time_stats_attribute_list(btree_read, ms, us) + sysfs_time_stats_attribute_list(try_harder, ms, us) + + &sysfs_btree_nodes, + &sysfs_btree_used_percent, + &sysfs_btree_cache_max_chain, + + &sysfs_bset_tree_stats, + &sysfs_cache_read_races, + &sysfs_writeback_keys_done, + &sysfs_writeback_keys_failed, + + &sysfs_trigger_gc, + &sysfs_prune_cache, +#ifdef CONFIG_BCACHE_DEBUG + &sysfs_verify, + &sysfs_key_merging_disabled, +#endif + &sysfs_gc_always_rewrite, + &sysfs_btree_shrinker_disabled, + &sysfs_copy_gc_enabled, + NULL +}; +KTYPE(bch_cache_set_internal); + +SHOW(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + sysfs_hprint(bucket_size, bucket_bytes(ca)); + sysfs_hprint(block_size, block_bytes(ca)); + sysfs_print(nbuckets, ca->sb.nbuckets); + sysfs_print(discard, ca->discard); + sysfs_hprint(written, atomic_long_read(&ca->sectors_written) << 9); + sysfs_hprint(btree_written, + atomic_long_read(&ca->btree_sectors_written) << 9); + sysfs_hprint(metadata_written, + (atomic_long_read(&ca->meta_sectors_written) + + atomic_long_read(&ca->btree_sectors_written)) << 9); + + sysfs_print(io_errors, + atomic_read(&ca->io_errors) >> IO_ERROR_SHIFT); + + sysfs_print(freelist_percent, ca->free.size * 100 / + ((size_t) ca->sb.nbuckets)); + + if (attr == &sysfs_cache_replacement_policy) + return snprint_string_list(buf, PAGE_SIZE, + cache_replacement_policies, + CACHE_REPLACEMENT(&ca->sb)); + + if (attr == &sysfs_priority_stats) { + int cmp(const void *l, const void *r) + { return *((uint16_t *) r) - *((uint16_t *) l); } + + /* Number of quantiles we compute */ + const unsigned nq = 31; + + size_t n = ca->sb.nbuckets, i, unused, btree; + uint64_t sum = 0; + uint16_t q[nq], *p, *cached; + ssize_t ret; + + cached = p = vmalloc(ca->sb.nbuckets * 
sizeof(uint16_t)); + if (!p) + return -ENOMEM; + + mutex_lock(&ca->set->bucket_lock); + for (i = ca->sb.first_bucket; i < n; i++) + p[i] = ca->buckets[i].prio; + mutex_unlock(&ca->set->bucket_lock); + + sort(p, n, sizeof(uint16_t), cmp, NULL); + + while (n && + !cached[n - 1]) + --n; + + unused = ca->sb.nbuckets - n; + + while (cached < p + n && + *cached == BTREE_PRIO) + cached++; + + btree = cached - p; + n -= btree; + + for (i = 0; i < n; i++) + sum += INITIAL_PRIO - cached[i]; + + if (n) + do_div(sum, n); + + for (i = 0; i < nq; i++) + q[i] = INITIAL_PRIO - cached[n * (i + 1) / (nq + 1)]; + + vfree(p); + + ret = snprintf(buf, PAGE_SIZE, + "Unused: %zu%%\n" + "Metadata: %zu%%\n" + "Average: %llu\n" + "Sectors per Q: %zu\n" + "Quantiles: [", + unused * 100 / (size_t) ca->sb.nbuckets, + btree * 100 / (size_t) ca->sb.nbuckets, sum, + n * ca->sb.bucket_size / (nq + 1)); + + for (i = 0; i < nq && ret < (ssize_t) PAGE_SIZE; i++) + ret += snprintf(buf + ret, PAGE_SIZE - ret, + i < nq - 1 ? "%u " : "%u]\n", q[i]); + + buf[PAGE_SIZE - 1] = '\0'; + return ret; + } + + return 0; +} +SHOW_LOCKED(bch_cache) + +STORE(__bch_cache) +{ + struct cache *ca = container_of(kobj, struct cache, kobj); + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + if (blk_queue_discard(bdev_get_queue(ca->bdev))) + ca->discard = v; + + if (v != CACHE_DISCARD(&ca->sb)) { + SET_CACHE_DISCARD(&ca->sb, v); + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_cache_replacement_policy) { + ssize_t v = read_string_list(buf, cache_replacement_policies); + + if (v < 0) + return v; + + if ((unsigned) v != CACHE_REPLACEMENT(&ca->sb)) { + mutex_lock(&ca->set->bucket_lock); + SET_CACHE_REPLACEMENT(&ca->sb, v); + mutex_unlock(&ca->set->bucket_lock); + + bcache_write_super(ca->set); + } + } + + if (attr == &sysfs_freelist_percent) { + DECLARE_FIFO(long, free); + long i; + size_t p = strtoul_or_return(buf); + + p = clamp_t(size_t, + ((size_t) ca->sb.nbuckets * p) / 100, + roundup_pow_of_two(ca->sb.nbuckets) >> 9, + ca->sb.nbuckets / 2); + + if (!init_fifo_exact(&free, p, GFP_KERNEL)) + return -ENOMEM; + + mutex_lock(&ca->set->bucket_lock); + + fifo_move(&free, &ca->free); + fifo_swap(&free, &ca->free); + + mutex_unlock(&ca->set->bucket_lock); + + while (fifo_pop(&free, i)) + atomic_dec(&ca->buckets[i].pin); + + free_fifo(&free); + } + + if (attr == &sysfs_clear_stats) { + atomic_long_set(&ca->sectors_written, 0); + atomic_long_set(&ca->btree_sectors_written, 0); + atomic_long_set(&ca->meta_sectors_written, 0); + atomic_set(&ca->io_count, 0); + atomic_set(&ca->io_errors, 0); + } + + return size; +} +STORE_LOCKED(bch_cache) + +static struct attribute *bch_cache_files[] = { + &sysfs_bucket_size, + &sysfs_block_size, + &sysfs_nbuckets, + &sysfs_priority_stats, + &sysfs_discard, + &sysfs_written, + &sysfs_btree_written, + &sysfs_metadata_written, + &sysfs_io_errors, + &sysfs_clear_stats, + &sysfs_freelist_percent, + &sysfs_cache_replacement_policy, + NULL +}; +KTYPE(bch_cache); diff --git a/drivers/md/bcache/sysfs.h b/drivers/md/bcache/sysfs.h new file mode 100644 index 000000000000..34e4ba1184fe --- /dev/null +++ b/drivers/md/bcache/sysfs.h @@ -0,0 +1,110 @@ +#ifndef _BCACHE_SYSFS_H_ +#define _BCACHE_SYSFS_H_ + +#define KTYPE(type) \ +struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &((const struct sysfs_ops) { \ + .show = type ## _show, \ + .store = type ## _store \ + }), \ + .default_attrs = type ## _files \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _show(struct 
kobject *kobj, struct attribute *attr,\ + char *buf) \ + +#define STORE(fn) \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ + +#define SHOW_LOCKED(fn) \ +SHOW(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _show(kobj, attr, buf); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define STORE_LOCKED(fn) \ +STORE(fn) \ +{ \ + ssize_t ret; \ + mutex_lock(&bch_register_lock); \ + ret = __ ## fn ## _store(kobj, attr, buf, size); \ + mutex_unlock(&bch_register_lock); \ + return ret; \ +} + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) +#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) +#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) + +#define sysfs_printf(file, fmt, ...) \ +do { \ + if (attr == &sysfs_ ## file) \ + return snprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return snprint(buf, PAGE_SIZE, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) { \ + ssize_t ret = hprint(buf, val); \ + strcat(buf, "\n"); \ + return ret + 1; \ + } \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +#define var_print(_var) sysfs_print(_var, var(_var)) +#define var_hprint(_var) sysfs_hprint(_var, var(_var)) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe_clamp(buf, var, min, max) \ + ?: (ssize_t) size; \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define strtoi_h_or_return(cp, v) \ +do { \ + int _r = strtoi_h(cp, &v); \ + if (_r) \ + return _r; \ +} while (0) + +#define sysfs_hatoi(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + +#endif /* _BCACHE_SYSFS_H_ */ diff --git a/drivers/md/bcache/trace.c b/drivers/md/bcache/trace.c new file mode 100644 index 000000000000..983f9bb411bc --- /dev/null +++ b/drivers/md/bcache/trace.c @@ -0,0 +1,26 @@ +#include "bcache.h" +#include "btree.h" +#include "request.h" + +#include + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_request_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_passthrough); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_hit); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_miss); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_retry); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writethrough); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_writeback); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_skip); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_read); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_btree_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_write_dirty); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_read_dirty); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_journal_write); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_cache_insert); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(bcache_gc_end); diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c new file mode 100644 index 
000000000000..dcec2e4f84ad --- /dev/null +++ b/drivers/md/bcache/util.c @@ -0,0 +1,389 @@ +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "util.h" + +#define simple_strtoint(c, end, base) simple_strtol(c, end, base) +#define simple_strtouint(c, end, base) simple_strtoul(c, end, base) + +#define STRTO_H(name, type) \ +int name ## _h(const char *cp, type *res) \ +{ \ + int u = 0; \ + char *e; \ + type i = simple_ ## name(cp, &e, 10); \ + \ + switch (tolower(*e)) { \ + default: \ + return -EINVAL; \ + case 'y': \ + case 'z': \ + u++; \ + case 'e': \ + u++; \ + case 'p': \ + u++; \ + case 't': \ + u++; \ + case 'g': \ + u++; \ + case 'm': \ + u++; \ + case 'k': \ + u++; \ + if (e++ == cp) \ + return -EINVAL; \ + case '\n': \ + case '\0': \ + if (*e == '\n') \ + e++; \ + } \ + \ + if (*e) \ + return -EINVAL; \ + \ + while (u--) { \ + if ((type) ~0 > 0 && \ + (type) ~0 / 1024 <= i) \ + return -EINVAL; \ + if ((i > 0 && ANYSINT_MAX(type) / 1024 < i) || \ + (i < 0 && -ANYSINT_MAX(type) / 1024 > i)) \ + return -EINVAL; \ + i *= 1024; \ + } \ + \ + *res = i; \ + return 0; \ +} \ +EXPORT_SYMBOL_GPL(name ## _h); + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) + +ssize_t hprint(char *buf, int64_t v) +{ + static const char units[] = "?kMGTPEZY"; + char dec[3] = ""; + int u, t = 0; + + for (u = 0; v >= 1024 || v <= -1024; u++) { + t = v & ~(~0 << 10); + v >>= 10; + } + + if (!u) + return sprintf(buf, "%llu", v); + + if (v < 100 && v > -100) + sprintf(dec, ".%i", t / 100); + + return sprintf(buf, "%lli%s%c", v, dec, units[u]); +} +EXPORT_SYMBOL_GPL(hprint); + +ssize_t snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected) +{ + char *out = buf; + size_t i; + + for (i = 0; list[i]; i++) + out += snprintf(out, buf + size - out, + i == selected ? "[%s] " : "%s ", list[i]); + + out[-1] = '\n'; + return out - buf; +} +EXPORT_SYMBOL_GPL(snprint_string_list); + +ssize_t read_string_list(const char *buf, const char * const list[]) +{ + size_t i; + char *s, *d = kstrndup(buf, PAGE_SIZE - 1, GFP_KERNEL); + if (!d) + return -ENOMEM; + + s = strim(d); + + for (i = 0; list[i]; i++) + if (!strcmp(list[i], s)) + break; + + kfree(d); + + if (!list[i]) + return -EINVAL; + + return i; +} +EXPORT_SYMBOL_GPL(read_string_list); + +bool is_zero(const char *p, size_t n) +{ + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} +EXPORT_SYMBOL_GPL(is_zero); + +int parse_uuid(const char *s, char *uuid) +{ + size_t i, j, x; + memset(uuid, 0, 16); + + for (i = 0, j = 0; + i < strspn(s, "-0123456789:ABCDEFabcdef") && j < 32; + i++) { + x = s[i] | 32; + + switch (x) { + case '0'...'9': + x -= '0'; + break; + case 'a'...'f': + x -= 'a' - 10; + break; + default: + continue; + } + + if (!(j & 1)) + x <<= 4; + uuid[j++ >> 1] |= x; + } + return i; +} +EXPORT_SYMBOL_GPL(parse_uuid); + +void time_stats_update(struct time_stats *stats, uint64_t start_time) +{ + uint64_t now = local_clock(); + uint64_t duration = time_after64(now, start_time) + ? now - start_time : 0; + uint64_t last = time_after64(now, stats->last) + ? 
now - stats->last : 0; + + stats->max_duration = max(stats->max_duration, duration); + + if (stats->last) { + ewma_add(stats->average_duration, duration, 8, 8); + + if (stats->average_frequency) + ewma_add(stats->average_frequency, last, 8, 8); + else + stats->average_frequency = last << 8; + } else { + stats->average_duration = duration << 8; + } + + stats->last = now ?: 1; +} +EXPORT_SYMBOL_GPL(time_stats_update); + +unsigned next_delay(struct ratelimit *d, uint64_t done) +{ + uint64_t now = local_clock(); + + d->next += div_u64(done, d->rate); + + return time_after64(d->next, now) + ? div_u64(d->next - now, NSEC_PER_SEC / HZ) + : 0; +} +EXPORT_SYMBOL_GPL(next_delay); + +void bio_map(struct bio *bio, void *base) +{ + size_t size = bio->bi_size; + struct bio_vec *bv = bio->bi_io_vec; + + BUG_ON(!bio->bi_size); + BUG_ON(bio->bi_vcnt); + + bv->bv_offset = base ? ((unsigned long) base) % PAGE_SIZE : 0; + goto start; + + for (; size; bio->bi_vcnt++, bv++) { + bv->bv_offset = 0; +start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset, + size); + if (base) { + bv->bv_page = is_vmalloc_addr(base) + ? vmalloc_to_page(base) + : virt_to_page(base); + + base += bv->bv_len; + } + + size -= bv->bv_len; + } +} +EXPORT_SYMBOL_GPL(bio_map); + +int bio_alloc_pages(struct bio *bio, gfp_t gfp) +{ + int i; + struct bio_vec *bv; + + bio_for_each_segment(bv, bio, i) { + bv->bv_page = alloc_page(gfp); + if (!bv->bv_page) { + while (bv-- != bio->bi_io_vec + bio->bi_idx) + __free_page(bv->bv_page); + return -ENOMEM; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(bio_alloc_pages); + +/* + * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any + * use permitted, subject to terms of PostgreSQL license; see.) + + * If we have a 64-bit integer type, then a 64-bit CRC looks just like the + * usual sort of implementation. (See Ross Williams' excellent introduction + * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from + * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) + * If we have no working 64-bit type, then fake it with two 32-bit registers. + * + * The present implementation is a normal (not "reflected", in Williams' + * terms) 64-bit CRC, using initial all-ones register contents and a final + * bit inversion. 
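A usage sketch of the two entry points defined below this table (buffer names here are hypothetical): feeding the data through crc64_update() in fragments matches a single crc64() call, since crc64() is just the all-ones seed, one update pass, and the final inversion.

	uint64_t c = 0xffffffffffffffffULL;	/* initial all-ones register */
	c = crc64_update(c, part1, len1);
	c = crc64_update(c, part2, len2);
	c ^= 0xffffffffffffffffULL;		/* final bit inversion */
	/* c now equals crc64(whole, len1 + len2) when part1 followed
	 * by part2 is the same bytes as whole */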
The chosen polynomial is borrowed from the DLT1 spec + * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 +*/ + +static const uint64_t crc_table[256] = { + 0x0000000000000000, 0x42F0E1EBA9EA3693, 0x85E1C3D753D46D26, + 0xC711223CFA3E5BB5, 0x493366450E42ECDF, 0x0BC387AEA7A8DA4C, + 0xCCD2A5925D9681F9, 0x8E224479F47CB76A, 0x9266CC8A1C85D9BE, + 0xD0962D61B56FEF2D, 0x17870F5D4F51B498, 0x5577EEB6E6BB820B, + 0xDB55AACF12C73561, 0x99A54B24BB2D03F2, 0x5EB4691841135847, + 0x1C4488F3E8F96ED4, 0x663D78FF90E185EF, 0x24CD9914390BB37C, + 0xE3DCBB28C335E8C9, 0xA12C5AC36ADFDE5A, 0x2F0E1EBA9EA36930, + 0x6DFEFF5137495FA3, 0xAAEFDD6DCD770416, 0xE81F3C86649D3285, + 0xF45BB4758C645C51, 0xB6AB559E258E6AC2, 0x71BA77A2DFB03177, + 0x334A9649765A07E4, 0xBD68D2308226B08E, 0xFF9833DB2BCC861D, + 0x388911E7D1F2DDA8, 0x7A79F00C7818EB3B, 0xCC7AF1FF21C30BDE, + 0x8E8A101488293D4D, 0x499B3228721766F8, 0x0B6BD3C3DBFD506B, + 0x854997BA2F81E701, 0xC7B97651866BD192, 0x00A8546D7C558A27, + 0x4258B586D5BFBCB4, 0x5E1C3D753D46D260, 0x1CECDC9E94ACE4F3, + 0xDBFDFEA26E92BF46, 0x990D1F49C77889D5, 0x172F5B3033043EBF, + 0x55DFBADB9AEE082C, 0x92CE98E760D05399, 0xD03E790CC93A650A, + 0xAA478900B1228E31, 0xE8B768EB18C8B8A2, 0x2FA64AD7E2F6E317, + 0x6D56AB3C4B1CD584, 0xE374EF45BF6062EE, 0xA1840EAE168A547D, + 0x66952C92ECB40FC8, 0x2465CD79455E395B, 0x3821458AADA7578F, + 0x7AD1A461044D611C, 0xBDC0865DFE733AA9, 0xFF3067B657990C3A, + 0x711223CFA3E5BB50, 0x33E2C2240A0F8DC3, 0xF4F3E018F031D676, + 0xB60301F359DBE0E5, 0xDA050215EA6C212F, 0x98F5E3FE438617BC, + 0x5FE4C1C2B9B84C09, 0x1D14202910527A9A, 0x93366450E42ECDF0, + 0xD1C685BB4DC4FB63, 0x16D7A787B7FAA0D6, 0x5427466C1E109645, + 0x4863CE9FF6E9F891, 0x0A932F745F03CE02, 0xCD820D48A53D95B7, + 0x8F72ECA30CD7A324, 0x0150A8DAF8AB144E, 0x43A04931514122DD, + 0x84B16B0DAB7F7968, 0xC6418AE602954FFB, 0xBC387AEA7A8DA4C0, + 0xFEC89B01D3679253, 0x39D9B93D2959C9E6, 0x7B2958D680B3FF75, + 0xF50B1CAF74CF481F, 0xB7FBFD44DD257E8C, 0x70EADF78271B2539, + 0x321A3E938EF113AA, 0x2E5EB66066087D7E, 0x6CAE578BCFE24BED, + 0xABBF75B735DC1058, 0xE94F945C9C3626CB, 0x676DD025684A91A1, + 0x259D31CEC1A0A732, 0xE28C13F23B9EFC87, 0xA07CF2199274CA14, + 0x167FF3EACBAF2AF1, 0x548F120162451C62, 0x939E303D987B47D7, + 0xD16ED1D631917144, 0x5F4C95AFC5EDC62E, 0x1DBC74446C07F0BD, + 0xDAAD56789639AB08, 0x985DB7933FD39D9B, 0x84193F60D72AF34F, + 0xC6E9DE8B7EC0C5DC, 0x01F8FCB784FE9E69, 0x43081D5C2D14A8FA, + 0xCD2A5925D9681F90, 0x8FDAB8CE70822903, 0x48CB9AF28ABC72B6, + 0x0A3B7B1923564425, 0x70428B155B4EAF1E, 0x32B26AFEF2A4998D, + 0xF5A348C2089AC238, 0xB753A929A170F4AB, 0x3971ED50550C43C1, + 0x7B810CBBFCE67552, 0xBC902E8706D82EE7, 0xFE60CF6CAF321874, + 0xE224479F47CB76A0, 0xA0D4A674EE214033, 0x67C58448141F1B86, + 0x253565A3BDF52D15, 0xAB1721DA49899A7F, 0xE9E7C031E063ACEC, + 0x2EF6E20D1A5DF759, 0x6C0603E6B3B7C1CA, 0xF6FAE5C07D3274CD, + 0xB40A042BD4D8425E, 0x731B26172EE619EB, 0x31EBC7FC870C2F78, + 0xBFC9838573709812, 0xFD39626EDA9AAE81, 0x3A28405220A4F534, + 0x78D8A1B9894EC3A7, 0x649C294A61B7AD73, 0x266CC8A1C85D9BE0, + 0xE17DEA9D3263C055, 0xA38D0B769B89F6C6, 0x2DAF4F0F6FF541AC, + 0x6F5FAEE4C61F773F, 0xA84E8CD83C212C8A, 0xEABE6D3395CB1A19, + 0x90C79D3FEDD3F122, 0xD2377CD44439C7B1, 0x15265EE8BE079C04, + 0x57D6BF0317EDAA97, 0xD9F4FB7AE3911DFD, 0x9B041A914A7B2B6E, + 0x5C1538ADB04570DB, 
0x1EE5D94619AF4648, 0x02A151B5F156289C, + 0x4051B05E58BC1E0F, 0x87409262A28245BA, 0xC5B073890B687329, + 0x4B9237F0FF14C443, 0x0962D61B56FEF2D0, 0xCE73F427ACC0A965, + 0x8C8315CC052A9FF6, 0x3A80143F5CF17F13, 0x7870F5D4F51B4980, + 0xBF61D7E80F251235, 0xFD913603A6CF24A6, 0x73B3727A52B393CC, + 0x31439391FB59A55F, 0xF652B1AD0167FEEA, 0xB4A25046A88DC879, + 0xA8E6D8B54074A6AD, 0xEA16395EE99E903E, 0x2D071B6213A0CB8B, + 0x6FF7FA89BA4AFD18, 0xE1D5BEF04E364A72, 0xA3255F1BE7DC7CE1, + 0x64347D271DE22754, 0x26C49CCCB40811C7, 0x5CBD6CC0CC10FAFC, + 0x1E4D8D2B65FACC6F, 0xD95CAF179FC497DA, 0x9BAC4EFC362EA149, + 0x158E0A85C2521623, 0x577EEB6E6BB820B0, 0x906FC95291867B05, + 0xD29F28B9386C4D96, 0xCEDBA04AD0952342, 0x8C2B41A1797F15D1, + 0x4B3A639D83414E64, 0x09CA82762AAB78F7, 0x87E8C60FDED7CF9D, + 0xC51827E4773DF90E, 0x020905D88D03A2BB, 0x40F9E43324E99428, + 0x2CFFE7D5975E55E2, 0x6E0F063E3EB46371, 0xA91E2402C48A38C4, + 0xEBEEC5E96D600E57, 0x65CC8190991CB93D, 0x273C607B30F68FAE, + 0xE02D4247CAC8D41B, 0xA2DDA3AC6322E288, 0xBE992B5F8BDB8C5C, + 0xFC69CAB42231BACF, 0x3B78E888D80FE17A, 0x7988096371E5D7E9, + 0xF7AA4D1A85996083, 0xB55AACF12C735610, 0x724B8ECDD64D0DA5, + 0x30BB6F267FA73B36, 0x4AC29F2A07BFD00D, 0x08327EC1AE55E69E, + 0xCF235CFD546BBD2B, 0x8DD3BD16FD818BB8, 0x03F1F96F09FD3CD2, + 0x41011884A0170A41, 0x86103AB85A2951F4, 0xC4E0DB53F3C36767, + 0xD8A453A01B3A09B3, 0x9A54B24BB2D03F20, 0x5D45907748EE6495, + 0x1FB5719CE1045206, 0x919735E51578E56C, 0xD367D40EBC92D3FF, + 0x1476F63246AC884A, 0x568617D9EF46BED9, 0xE085162AB69D5E3C, + 0xA275F7C11F7768AF, 0x6564D5FDE549331A, 0x279434164CA30589, + 0xA9B6706FB8DFB2E3, 0xEB46918411358470, 0x2C57B3B8EB0BDFC5, + 0x6EA7525342E1E956, 0x72E3DAA0AA188782, 0x30133B4B03F2B111, + 0xF7021977F9CCEAA4, 0xB5F2F89C5026DC37, 0x3BD0BCE5A45A6B5D, + 0x79205D0E0DB05DCE, 0xBE317F32F78E067B, 0xFCC19ED95E6430E8, + 0x86B86ED5267CDBD3, 0xC4488F3E8F96ED40, 0x0359AD0275A8B6F5, + 0x41A94CE9DC428066, 0xCF8B0890283E370C, 0x8D7BE97B81D4019F, + 0x4A6ACB477BEA5A2A, 0x089A2AACD2006CB9, 0x14DEA25F3AF9026D, + 0x562E43B4931334FE, 0x913F6188692D6F4B, 0xD3CF8063C0C759D8, + 0x5DEDC41A34BBEEB2, 0x1F1D25F19D51D821, 0xD80C07CD676F8394, + 0x9AFCE626CE85B507 +}; + +uint64_t crc64_update(uint64_t crc, const void *_data, size_t len) +{ + const unsigned char *data = _data; + + while (len--) { + int i = ((int) (crc >> 56) ^ *data++) & 0xFF; + crc = crc_table[i] ^ (crc << 8); + } + + return crc; +} +EXPORT_SYMBOL(crc64_update); + +uint64_t crc64(const void *data, size_t len) +{ + uint64_t crc = 0xffffffffffffffff; + + crc = crc64_update(crc, data, len); + + return crc ^ 0xffffffffffffffff; +} +EXPORT_SYMBOL(crc64); diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h new file mode 100644 index 000000000000..56705fdcc149 --- /dev/null +++ b/drivers/md/bcache/util.h @@ -0,0 +1,589 @@ + +#ifndef _BCACHE_UTIL_H +#define _BCACHE_UTIL_H + +#include +#include +#include +#include +#include +#include + +#include "closure.h" + +#define PAGE_SECTORS (PAGE_SIZE / 512) + +struct closure; + +#include + +#ifdef CONFIG_BCACHE_EDEBUG + +#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) +#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) + +#else /* EDEBUG */ + +#define atomic_dec_bug(v) atomic_dec(v) +#define atomic_inc_bug(v, i) atomic_inc(v) + +#endif + +#define BITMASK(name, type, field, offset, size) \ +static inline uint64_t name(const type *k) \ +{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ + \ +static inline void SET_##name(type *k, uint64_t v) \ +{ \ + k->field &= ~(~((uint64_t) 
~0 << size) << offset); \ + k->field |= v << offset; \ +} + +#define DECLARE_HEAP(type, name) \ + struct { \ + size_t size, used; \ + type *data; \ + } name + +#define init_heap(heap, _size, gfp) \ +({ \ + size_t _bytes; \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + _bytes = (heap)->size * sizeof(*(heap)->data); \ + (heap)->data = NULL; \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (heap)->data = kmalloc(_bytes, (gfp)); \ + if ((!(heap)->data) && ((gfp) & GFP_KERNEL)) \ + (heap)->data = vmalloc(_bytes); \ + (heap)->data; \ +}) + +#define free_heap(heap) \ +do { \ + if (is_vmalloc_addr((heap)->data)) \ + vfree((heap)->data); \ + else \ + kfree((heap)->data); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_swap(h, i, j) swap((h)->data[i], (h)->data[j]) + +#define heap_sift(h, i, cmp) \ +do { \ + size_t _r, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _r) { \ + _r = _j * 2 + 1; \ + if (_r + 1 < (h)->used && \ + cmp((h)->data[_r], (h)->data[_r + 1])) \ + _r++; \ + \ + if (cmp((h)->data[_r], (h)->data[_j])) \ + break; \ + heap_swap(h, _r, _j); \ + } \ +} while (0) + +#define heap_sift_down(h, i, cmp) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp((h)->data[i], (h)->data[p])) \ + break; \ + heap_swap(h, i, p); \ + i = p; \ + } \ +} while (0) + +#define heap_add(h, d, cmp) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) { \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + \ + heap_sift_down(h, _i, cmp); \ + heap_sift(h, _i, cmp); \ + } \ + _r; \ +}) + +#define heap_pop(h, d, cmp) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + (h)->used--; \ + heap_swap(h, 0, (h)->used); \ + heap_sift(h, 0, cmp); \ + } \ + _r; \ +}) + +#define heap_peek(h) ((h)->size ? (h)->data[0] : NULL) + +#define heap_full(h) ((h)->used == (h)->size) + +#define DECLARE_FIFO(type, name) \ + struct { \ + size_t front, back, size, mask; \ + type *data; \ + } name + +#define fifo_for_each(c, fifo, iter) \ + for (iter = (fifo)->front; \ + c = (fifo)->data[iter], iter != (fifo)->back; \ + iter = (iter + 1) & (fifo)->mask) + +#define __init_fifo(fifo, gfp) \ +({ \ + size_t _allocated_size, _bytes; \ + BUG_ON(!(fifo)->size); \ + \ + _allocated_size = roundup_pow_of_two((fifo)->size + 1); \ + _bytes = _allocated_size * sizeof(*(fifo)->data); \ + \ + (fifo)->mask = _allocated_size - 1; \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->data = NULL; \ + \ + if (_bytes < KMALLOC_MAX_SIZE) \ + (fifo)->data = kmalloc(_bytes, (gfp)); \ + if ((!(fifo)->data) && ((gfp) & GFP_KERNEL)) \ + (fifo)->data = vmalloc(_bytes); \ + (fifo)->data; \ +}) + +#define init_fifo_exact(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + __init_fifo(fifo, gfp); \ +}) + +#define init_fifo(fifo, _size, gfp) \ +({ \ + (fifo)->size = (_size); \ + if ((fifo)->size > 4) \ + (fifo)->size = roundup_pow_of_two((fifo)->size) - 1; \ + __init_fifo(fifo, gfp); \ +}) + +#define free_fifo(fifo) \ +do { \ + if (is_vmalloc_addr((fifo)->data)) \ + vfree((fifo)->data); \ + else \ + kfree((fifo)->data); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front) & (fifo)->mask) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) (!fifo_used(fifo)) +#define fifo_full(fifo) (!fifo_free(fifo)) + +#define fifo_front(fifo) ((fifo)->data[(fifo)->front]) +#define fifo_back(fifo) \ + ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_idx(fifo, p) (((p) - &fifo_front(fifo)) & (fifo)->mask) + +#define fifo_push_back(fifo, i) \ +({ \ + bool 
_r = !fifo_full((fifo)); \ + if (_r) { \ + (fifo)->data[(fifo)->back++] = (i); \ + (fifo)->back &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + (i) = (fifo)->data[(fifo)->front++]; \ + (fifo)->front &= (fifo)->mask; \ + } \ + _r; \ +}) + +#define fifo_push_front(fifo, i) \ +({ \ + bool _r = !fifo_full((fifo)); \ + if (_r) { \ + --(fifo)->front; \ + (fifo)->front &= (fifo)->mask; \ + (fifo)->data[(fifo)->front] = (i); \ + } \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) { \ + --(fifo)->back; \ + (fifo)->back &= (fifo)->mask; \ + (i) = (fifo)->data[(fifo)->back] \ + } \ + _r; \ +}) + +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +/* + * Simple array based allocator - preallocates a number of elements and you can + * never allocate more than that, also has no locking. + * + * Handy because if you know you only need a fixed number of elements you don't + * have to worry about memory allocation failure, and sometimes a mempool isn't + * what you want. + * + * We treat the free elements as entries in a singly linked list, and the + * freelist as a stack - allocating and freeing push and pop off the freelist. + */ + +#define DECLARE_ARRAY_ALLOCATOR(type, name, size) \ + struct { \ + type *freelist; \ + type data[size]; \ + } name + +#define array_alloc(array) \ +({ \ + typeof((array)->freelist) _ret = (array)->freelist; \ + \ + if (_ret) \ + (array)->freelist = *((typeof((array)->freelist) *) _ret);\ + \ + _ret; \ +}) + +#define array_free(array, ptr) \ +do { \ + typeof((array)->freelist) _ptr = ptr; \ + \ + *((typeof((array)->freelist) *) _ptr) = (array)->freelist; \ + (array)->freelist = _ptr; \ +} while (0) + +#define array_allocator_init(array) \ +do { \ + typeof((array)->freelist) _i; \ + \ + BUILD_BUG_ON(sizeof((array)->data[0]) < sizeof(void *)); \ + (array)->freelist = NULL; \ + \ + for (_i = (array)->data; \ + _i < (array)->data + ARRAY_SIZE((array)->data); \ + _i++) \ + array_free(array, _i); \ +} while (0) + +#define array_freelist_empty(array) ((array)->freelist == NULL) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +int strtoint_h(const char *, int *); +int strtouint_h(const char *, unsigned int *); +int strtoll_h(const char *, long long *); +int strtoull_h(const char *, unsigned long long *); + +static inline int strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return strtoint_h(cp, (int *) res); +#else + return strtoll_h(cp, (long long *) res); +#endif +} + +static inline int strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return strtouint_h(cp, (unsigned int *) res); +#else + return strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + (__builtin_types_compatible_p(typeof(*res), int) \ + ? strtoint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long) \ + ? strtol_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), long long) \ + ? 
strtoll_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned int) \ + ? strtouint_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long) \ + ? strtoul_h(cp, (void *) res) \ + : __builtin_types_compatible_p(typeof(*res), unsigned long long)\ + ? strtoull_h(cp, (void *) res) : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define snprint(buf, size, var) \ + snprintf(buf, size, \ + __builtin_types_compatible_p(typeof(var), int) \ + ? "%i\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned) \ + ? "%u\n" : \ + __builtin_types_compatible_p(typeof(var), long) \ + ? "%li\n" : \ + __builtin_types_compatible_p(typeof(var), unsigned long)\ + ? "%lu\n" : \ + __builtin_types_compatible_p(typeof(var), int64_t) \ + ? "%lli\n" : \ + __builtin_types_compatible_p(typeof(var), uint64_t) \ + ? "%llu\n" : \ + __builtin_types_compatible_p(typeof(var), const char *) \ + ? "%s\n" : "%i\n", var) + +ssize_t hprint(char *buf, int64_t v); + +bool is_zero(const char *p, size_t n); +int parse_uuid(const char *s, char *uuid); + +ssize_t snprint_string_list(char *buf, size_t size, const char * const list[], + size_t selected); + +ssize_t read_string_list(const char *buf, const char * const list[]); + +struct time_stats { + /* + * all fields are in nanoseconds, averages are ewmas stored left shifted + * by 8 + */ + uint64_t max_duration; + uint64_t average_duration; + uint64_t average_frequency; + uint64_t last; +}; + +void time_stats_update(struct time_stats *stats, uint64_t time); + +#define NSEC_PER_ns 1L +#define NSEC_PER_us NSEC_PER_USEC +#define NSEC_PER_ms NSEC_PER_MSEC +#define NSEC_PER_sec NSEC_PER_SEC + +#define __print_time_stat(stats, name, stat, units) \ + sysfs_print(name ## _ ## stat ## _ ## units, \ + div_u64((stats)->stat >> 8, NSEC_PER_ ## units)) + +#define sysfs_print_time_stats(stats, name, \ + frequency_units, \ + duration_units) \ +do { \ + __print_time_stat(stats, name, \ + average_frequency, frequency_units); \ + __print_time_stat(stats, name, \ + average_duration, duration_units); \ + __print_time_stat(stats, name, \ + max_duration, duration_units); \ + \ + sysfs_print(name ## _last_ ## frequency_units, (stats)->last \ + ? 
div_s64(local_clock() - (stats)->last, \ + NSEC_PER_ ## frequency_units) \ + : -1LL); \ +} while (0) + +#define sysfs_time_stats_attribute(name, \ + frequency_units, \ + duration_units) \ +read_attribute(name ## _average_frequency_ ## frequency_units); \ +read_attribute(name ## _average_duration_ ## duration_units); \ +read_attribute(name ## _max_duration_ ## duration_units); \ +read_attribute(name ## _last_ ## frequency_units) + +#define sysfs_time_stats_attribute_list(name, \ + frequency_units, \ + duration_units) \ +&sysfs_ ## name ## _average_frequency_ ## frequency_units, \ +&sysfs_ ## name ## _average_duration_ ## duration_units, \ +&sysfs_ ## name ## _max_duration_ ## duration_units, \ +&sysfs_ ## name ## _last_ ## frequency_units, + +#define ewma_add(ewma, val, weight, factor) \ +({ \ + (ewma) *= (weight) - 1; \ + (ewma) += (val) << factor; \ + (ewma) /= (weight); \ + (ewma) >> factor; \ +}) + +struct ratelimit { + uint64_t next; + unsigned rate; +}; + +static inline void ratelimit_reset(struct ratelimit *d) +{ + d->next = local_clock(); +} + +unsigned next_delay(struct ratelimit *d, uint64_t done); + +#define __DIV_SAFE(n, d, zero) \ +({ \ + typeof(n) _n = (n); \ + typeof(d) _d = (d); \ + _d ? _n / _d : zero; \ +}) + +#define DIV_SAFE(n, d) __DIV_SAFE(n, d, 0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? container_of(_ptr, type, member) : NULL; \ +}) + +#define RB_INSERT(root, new, member, cmp) \ +({ \ + __label__ dup; \ + struct rb_node **n = &(root)->rb_node, *parent = NULL; \ + typeof(new) this; \ + int res, ret = -1; \ + \ + while (*n) { \ + parent = *n; \ + this = container_of(*n, typeof(*(new)), member); \ + res = cmp(new, this); \ + if (!res) \ + goto dup; \ + n = res < 0 \ + ? &(*n)->rb_left \ + : &(*n)->rb_right; \ + } \ + \ + rb_link_node(&(new)->member, parent, n); \ + rb_insert_color(&(new)->member, root); \ + ret = 0; \ +dup: \ + ret; \ +}) + +#define RB_SEARCH(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (!res) { \ + ret = this; \ + break; \ + } \ + n = res < 0 \ + ? 
n->rb_left \ + : n->rb_right; \ + } \ + ret; \ +}) + +#define RB_GREATER(root, search, member, cmp) \ +({ \ + struct rb_node *n = (root)->rb_node; \ + typeof(&(search)) this, ret = NULL; \ + int res; \ + \ + while (n) { \ + this = container_of(n, typeof(search), member); \ + res = cmp(&(search), this); \ + if (res < 0) { \ + ret = this; \ + n = n->rb_left; \ + } else \ + n = n->rb_right; \ + } \ + ret; \ +}) + +#define RB_FIRST(root, type, member) \ + container_of_or_null(rb_first(root), type, member) + +#define RB_LAST(root, type, member) \ + container_of_or_null(rb_last(root), type, member) + +#define RB_NEXT(ptr, member) \ + container_of_or_null(rb_next(&(ptr)->member), typeof(*ptr), member) + +#define RB_PREV(ptr, member) \ + container_of_or_null(rb_prev(&(ptr)->member), typeof(*ptr), member) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +#define bio_end(bio) ((bio)->bi_sector + bio_sectors(bio)) + +void bio_map(struct bio *bio, void *base); + +int bio_alloc_pages(struct bio *bio, gfp_t gfp); + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl, dev) \ +do { \ + closure_get(cl); \ + bch_generic_make_request(bio, &(dev)->bio_split_hook); \ +} while (0) + +uint64_t crc64_update(uint64_t, const void *, size_t); +uint64_t crc64(const void *, size_t); + +#endif /* _BCACHE_UTIL_H */ diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c new file mode 100644 index 000000000000..a80ee5373fd8 --- /dev/null +++ b/drivers/md/bcache/writeback.c @@ -0,0 +1,414 @@ +/* + * background writeback - scan btree for dirty data and write it to the backing + * device + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcache.h" +#include "btree.h" +#include "debug.h" + +static struct workqueue_struct *dirty_wq; + +static void read_dirty(struct closure *); + +struct dirty_io { + struct closure cl; + struct cached_dev *dc; + struct bio bio; +}; + +/* Rate limiting */ + +static void __update_writeback_rate(struct cached_dev *dc) +{ + struct cache_set *c = dc->disk.c; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_dirty_target = + div_u64(cache_sectors * dc->writeback_percent, 100); + + int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev), + c->cached_dev_sectors); + + /* PD controller */ + + int change = 0; + int64_t error; + int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty); + int64_t derivative = dirty - dc->disk.sectors_dirty_last; + + dc->disk.sectors_dirty_last = dirty; + + derivative *= dc->writeback_rate_d_term; + derivative = clamp(derivative, -dirty, dirty); + + derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative, + dc->writeback_rate_d_smooth, 0); + + /* Avoid divide by zero */ + if (!target) + goto out; + + error = div64_s64((dirty + derivative - target) << 8, target); + + change = div_s64((dc->writeback_rate.rate * error) >> 8, + dc->writeback_rate_p_term_inverse); + + /* Don't increase writeback rate if the device isn't keeping up */ + if (change > 0 && + time_after64(local_clock(), + dc->writeback_rate.next + 10 * NSEC_PER_MSEC)) + change = 0; + + dc->writeback_rate.rate = + clamp_t(int64_t, dc->writeback_rate.rate + change, + 1, NSEC_PER_MSEC); +out: + dc->writeback_rate_derivative = derivative; + dc->writeback_rate_change = change; + dc->writeback_rate_target = target; + + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +} + +static void update_writeback_rate(struct work_struct *work) +{ + struct cached_dev *dc = container_of(to_delayed_work(work), + struct cached_dev, + writeback_rate_update); + + down_read(&dc->writeback_lock); + + if (atomic_read(&dc->has_dirty) && + dc->writeback_percent) + __update_writeback_rate(dc); + + up_read(&dc->writeback_lock); +} + +static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors) +{ + if (atomic_read(&dc->disk.detaching) || + !dc->writeback_percent) + return 0; + + return next_delay(&dc->writeback_rate, sectors * 10000000ULL); +} + +/* Background writeback */ + +static bool dirty_pred(struct keybuf *buf, struct bkey *k) +{ + return KEY_DIRTY(k); +} + +static void dirty_init(struct keybuf_key *w) +{ + struct dirty_io *io = w->private; + struct bio *bio = &io->bio; + + bio_init(bio); + if (!io->dc->writeback_percent) + bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + bio->bi_size = KEY_SIZE(&w->key) << 9; + bio->bi_max_vecs = DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS); + bio->bi_private = w; + bio->bi_io_vec = bio->bi_inline_vecs; + bio_map(bio, NULL); +} + +static void refill_dirty(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, + writeback.cl); + struct keybuf *buf = &dc->writeback_keys; + bool searched_from_start = false; + struct bkey end = MAX_KEY; + SET_KEY_INODE(&end, dc->disk.id); + + if (!atomic_read(&dc->disk.detaching) && + !dc->writeback_running) + closure_return(cl); + + down_write(&dc->writeback_lock); + + if (!atomic_read(&dc->has_dirty)) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN); + bch_write_bdev_super(dc, NULL); + + up_write(&dc->writeback_lock); + closure_return(cl); + } + + if (bkey_cmp(&buf->last_scanned, &end) >= 0) { + 
buf->last_scanned = KEY(dc->disk.id, 0, 0); + searched_from_start = true; + } + + bch_refill_keybuf(dc->disk.c, buf, &end); + + if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) { + /* Searched the entire btree - delay awhile */ + + if (RB_EMPTY_ROOT(&buf->keys)) { + atomic_set(&dc->has_dirty, 0); + cached_dev_put(dc); + } + + if (!atomic_read(&dc->disk.detaching)) + closure_delay(&dc->writeback, dc->writeback_delay * HZ); + } + + up_write(&dc->writeback_lock); + + ratelimit_reset(&dc->writeback_rate); + + /* Punt to workqueue only so we don't recurse and blow the stack */ + continue_at(cl, read_dirty, dirty_wq); +} + +void bch_writeback_queue(struct cached_dev *dc) +{ + if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) { + if (!atomic_read(&dc->disk.detaching)) + closure_delay(&dc->writeback, dc->writeback_delay * HZ); + + continue_at(&dc->writeback.cl, refill_dirty, dirty_wq); + } +} + +void bch_writeback_add(struct cached_dev *dc, unsigned sectors) +{ + atomic_long_add(sectors, &dc->disk.sectors_dirty); + + if (!atomic_read(&dc->has_dirty) && + !atomic_xchg(&dc->has_dirty, 1)) { + atomic_inc(&dc->count); + + if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) { + SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY); + /* XXX: should do this synchronously */ + bch_write_bdev_super(dc, NULL); + } + + bch_writeback_queue(dc); + + if (dc->writeback_percent) + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); + } +} + +/* Background writeback - IO loop */ + +static void dirty_io_destructor(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + kfree(io); +} + +static void write_dirty_finish(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + struct cached_dev *dc = io->dc; + struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt); + + while (bv-- != io->bio.bi_io_vec) + __free_page(bv->bv_page); + + /* This is kind of a dumb way of signalling errors. */ + if (KEY_DIRTY(&w->key)) { + unsigned i; + struct btree_op op; + bch_btree_op_init_stack(&op); + + op.type = BTREE_REPLACE; + bkey_copy(&op.replace, &w->key); + + SET_KEY_DIRTY(&w->key, false); + bch_keylist_add(&op.keys, &w->key); + + for (i = 0; i < KEY_PTRS(&w->key); i++) + atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin); + + pr_debug("clearing %s", pkey(&w->key)); + bch_btree_insert(&op, dc->disk.c); + closure_sync(&op.cl); + + atomic_long_inc(op.insert_collision + ? 
&dc->disk.c->writeback_keys_failed + : &dc->disk.c->writeback_keys_done); + } + + bch_keybuf_del(&dc->writeback_keys, w); + atomic_dec_bug(&dc->in_flight); + + closure_wake_up(&dc->writeback_wait); + + closure_return_with_destructor(cl, dirty_io_destructor); +} + +static void dirty_endio(struct bio *bio, int error) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + if (error) + SET_KEY_DIRTY(&w->key, false); + + closure_put(&io->cl); +} + +static void write_dirty(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + struct keybuf_key *w = io->bio.bi_private; + + dirty_init(w); + io->bio.bi_rw = WRITE; + io->bio.bi_sector = KEY_START(&w->key); + io->bio.bi_bdev = io->dc->bdev; + io->bio.bi_end_io = dirty_endio; + + trace_bcache_write_dirty(&io->bio); + closure_bio_submit(&io->bio, cl, &io->dc->disk); + + continue_at(cl, write_dirty_finish, dirty_wq); +} + +static void read_dirty_endio(struct bio *bio, int error) +{ + struct keybuf_key *w = bio->bi_private; + struct dirty_io *io = w->private; + + bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0), + error, "reading dirty data from cache"); + + dirty_endio(bio, error); +} + +static void read_dirty_submit(struct closure *cl) +{ + struct dirty_io *io = container_of(cl, struct dirty_io, cl); + + trace_bcache_read_dirty(&io->bio); + closure_bio_submit(&io->bio, cl, &io->dc->disk); + + continue_at(cl, write_dirty, dirty_wq); +} + +static void read_dirty(struct closure *cl) +{ + struct cached_dev *dc = container_of(cl, struct cached_dev, + writeback.cl); + unsigned delay = writeback_delay(dc, 0); + struct keybuf_key *w; + struct dirty_io *io; + + /* + * XXX: if we error, background writeback just spins. Should use some + * mempools. + */ + + while (1) { + w = bch_keybuf_next(&dc->writeback_keys); + if (!w) + break; + + BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); + + if (delay > 0 && + (KEY_START(&w->key) != dc->last_read || + jiffies_to_msecs(delay) > 50)) { + w->private = NULL; + + closure_delay(&dc->writeback, delay); + continue_at(cl, read_dirty, dirty_wq); + } + + dc->last_read = KEY_OFFSET(&w->key); + + io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS), + GFP_KERNEL); + if (!io) + goto err; + + w->private = io; + io->dc = dc; + + dirty_init(w); + io->bio.bi_sector = PTR_OFFSET(&w->key, 0); + io->bio.bi_bdev = PTR_CACHE(dc->disk.c, + &w->key, 0)->bdev; + io->bio.bi_rw = READ; + io->bio.bi_end_io = read_dirty_endio; + + if (bio_alloc_pages(&io->bio, GFP_KERNEL)) + goto err_free; + + pr_debug("%s", pkey(&w->key)); + + closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl); + + delay = writeback_delay(dc, KEY_SIZE(&w->key)); + + atomic_inc(&dc->in_flight); + + if (!closure_wait_event(&dc->writeback_wait, cl, + atomic_read(&dc->in_flight) < 64)) + continue_at(cl, read_dirty, dirty_wq); + } + + if (0) { +err_free: + kfree(w->private); +err: + bch_keybuf_del(&dc->writeback_keys, w); + } + + refill_dirty(cl); +} + +void bch_writeback_init_cached_dev(struct cached_dev *dc) +{ + closure_init_unlocked(&dc->writeback); + init_rwsem(&dc->writeback_lock); + + bch_keybuf_init(&dc->writeback_keys, dirty_pred); + + dc->writeback_metadata = true; + dc->writeback_running = true; + dc->writeback_percent = 10; + dc->writeback_delay = 30; + dc->writeback_rate.rate = 1024; + + dc->writeback_rate_update_seconds = 30; + dc->writeback_rate_d_term = 16; + dc->writeback_rate_p_term_inverse = 64; + dc->writeback_rate_d_smooth = 8; + 
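The defaults above feed the PD controller in __update_writeback_rate(). Stripped of the kernel fixed-point helpers, one controller step amounts to the sketch below; this is illustrative only, since the real code also scales the derivative by writeback_rate_d_term, smooths it with an EWMA, refuses to raise the rate when the device is not keeping up, and clamps the result to [1, NSEC_PER_MSEC].

	/* Illustrative restatement of the PD step; not the kernel code. */
	static int64_t writeback_rate_step(int64_t rate, int64_t dirty,
					   int64_t derivative, int64_t target)
	{
		int64_t error, change;

		if (!target)		/* avoid divide by zero */
			return rate;

		/* error in 8.8 fixed point, normalized to the dirty target */
		error = ((dirty + derivative - target) << 8) / target;

		/* writeback_rate_p_term_inverse defaults to 64, as set above */
		change = ((rate * error) >> 8) / 64;

		rate += change;
		return rate < 1 ? 1 : rate;
	}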
+ INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate); + schedule_delayed_work(&dc->writeback_rate_update, + dc->writeback_rate_update_seconds * HZ); +} + +void bch_writeback_exit(void) +{ + if (dirty_wq) + destroy_workqueue(dirty_wq); +} + +int __init bch_writeback_init(void) +{ + dirty_wq = create_singlethread_workqueue("bcache_writeback"); + if (!dirty_wq) + return -ENOMEM; + + return 0; +} diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index f204a7a9cf38..6e7ec64b69ab 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -78,3 +78,9 @@ SUBSYS(hugetlb) #endif /* */ + +#ifdef CONFIG_CGROUP_BCACHE +SUBSYS(bcache) +#endif + +/* */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..a8482d063bc3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1576,6 +1576,10 @@ struct task_struct { #ifdef CONFIG_UPROBES struct uprobe_task *utask; #endif +#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE) + unsigned int sequential_io; + unsigned int sequential_io_avg; +#endif }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h new file mode 100644 index 000000000000..3cc5a0b278c3 --- /dev/null +++ b/include/trace/events/bcache.h @@ -0,0 +1,271 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcache + +#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHE_H + +#include + +struct search; + +DECLARE_EVENT_CLASS(bcache_request, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned int, orig_major ) + __field(unsigned int, orig_minor ) + __field(sector_t, sector ) + __field(dev_t, orig_sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->orig_major = s->d->disk->major; + __entry->orig_minor = s->d->disk->first_minor; + __entry->sector = bio->bi_sector; + __entry->orig_sector = bio->bi_sector - 16; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d @ %llu)", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm, + __entry->orig_major, __entry->orig_minor, + (unsigned long long)__entry->orig_sector) +); + +DEFINE_EVENT(bcache_request, bcache_request_start, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio) +); + +DEFINE_EVENT(bcache_request, bcache_request_end, + + TP_PROTO(struct search *s, struct bio *bio), + + TP_ARGS(s, bio) +); + +DECLARE_EVENT_CLASS(bcache_bio, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s]", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, 
__entry->comm) +); + + +DEFINE_EVENT(bcache_bio, bcache_passthrough, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_cache_hit, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_cache_miss, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_retry, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_writethrough, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_writeback, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_skip, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_btree_read, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_btree_write, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_dirty, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_dirty, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_write_moving, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_read_moving, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DEFINE_EVENT(bcache_bio, bcache_journal_write, + + TP_PROTO(struct bio *bio), + + TP_ARGS(bio) +); + +DECLARE_EVENT_CLASS(bcache_cache_bio, + + TP_PROTO(struct bio *bio, + sector_t orig_sector, + struct block_device* orig_bdev), + + TP_ARGS(bio, orig_sector, orig_bdev), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(dev_t, orig_dev ) + __field(sector_t, sector ) + __field(sector_t, orig_sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + __array(char, comm, TASK_COMM_LEN ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev->bd_dev; + __entry->orig_dev = orig_bdev->bd_dev; + __entry->sector = bio->bi_sector; + __entry->orig_sector = orig_sector; + __entry->nr_sector = bio->bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + ), + + TP_printk("%d,%d %s %llu + %u [%s] (from %d,%d %llu)", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, __entry->comm, + MAJOR(__entry->orig_dev), MINOR(__entry->orig_dev), + (unsigned long long)__entry->orig_sector) +); + +DEFINE_EVENT(bcache_cache_bio, bcache_cache_insert, + + TP_PROTO(struct bio *bio, + sector_t orig_sector, + struct block_device *orig_bdev), + + TP_ARGS(bio, orig_sector, orig_bdev) +); + +DECLARE_EVENT_CLASS(bcache_gc, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid), + + TP_STRUCT__entry( + __field(uint8_t *, uuid) + ), + + TP_fast_assign( + __entry->uuid = uuid; + ), + + TP_printk("%pU", __entry->uuid) +); + + +DEFINE_EVENT(bcache_gc, bcache_gc_start, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid) +); + +DEFINE_EVENT(bcache_gc, bcache_gc_end, + + TP_PROTO(uint8_t *uuid), + + TP_ARGS(uuid) +); + +#endif /* _TRACE_BCACHE_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/fork.c b/kernel/fork.c index 1766d324d5e3..7b54fb62332c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1303,6 +1303,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif +#ifdef CONFIG_BCACHE + p->sequential_io = 0; + p->sequential_io_avg = 0; +#endif /* Perform scheduler related setup. 
Assign this task to a CPU. */ sched_fork(p); -- cgit From 5f9296ba21b3c395e53dd84e7ff9578f97f24295 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 28 Feb 2012 18:26:31 +0530 Subject: i2c: Add bus recovery infrastructure Add i2c bus recovery infrastructure to i2c adapters as specified in the i2c protocol Rev. 03 section 3.1.16 titled "Bus clear". http://www.nxp.com/documents/user_manual/UM10204.pdf Sometimes during operation the i2c bus hangs and we need to give dummy clocks to the slave device to restart the transfer. The bus controller may have the capability to generate these clocks itself, or the platform may have gpio pins which can be toggled to generate the dummy clocks; this patch supports both. This patch also adds generic bus recovery routines, gpio or scl line based, which bus controllers can use. In addition, a controller driver may provide its own version of the bus recovery routine. This doesn't support multi-master recovery for now. Signed-off-by: Viresh Kumar [wsa: changed gpio type to int and minor reformatting] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 159 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/i2c.h | 41 +++++++++++++ 2 files changed, 200 insertions(+) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index f7dfe878a51b..0d873ba2e82e 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -27,7 +27,9 @@ #include #include +#include #include +#include #include #include #include @@ -109,6 +111,130 @@ static int i2c_device_uevent(struct device *dev, struct kobj_uevent_env *env) #define i2c_device_uevent NULL #endif /* CONFIG_HOTPLUG */ +/* i2c bus recovery routines */ +static int get_scl_gpio_value(struct i2c_adapter *adap) +{ + return gpio_get_value(adap->bus_recovery_info->scl_gpio); +} + +static void set_scl_gpio_value(struct i2c_adapter *adap, int val) +{ + gpio_set_value(adap->bus_recovery_info->scl_gpio, val); +} + +static int get_sda_gpio_value(struct i2c_adapter *adap) +{ + return gpio_get_value(adap->bus_recovery_info->sda_gpio); +} + +static int i2c_get_gpios_for_recovery(struct i2c_adapter *adap) +{ + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + struct device *dev = &adap->dev; + int ret = 0; + + ret = gpio_request_one(bri->scl_gpio, GPIOF_OPEN_DRAIN | + GPIOF_OUT_INIT_HIGH, "i2c-scl"); + if (ret) { + dev_warn(dev, "Can't get SCL gpio: %d\n", bri->scl_gpio); + return ret; + } + + if (bri->get_sda) { + if (gpio_request_one(bri->sda_gpio, GPIOF_IN, "i2c-sda")) { + /* work without SDA polling */ + dev_warn(dev, "Can't get SDA gpio: %d. Not using SDA polling\n", + bri->sda_gpio); + bri->get_sda = NULL; + } + } + + return ret; +} + +static void i2c_put_gpios_for_recovery(struct i2c_adapter *adap) +{ + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + + if (bri->get_sda) + gpio_free(bri->sda_gpio); + + gpio_free(bri->scl_gpio); +} + +/* + * We are generating clock pulses. ndelay() determines the duration of the clock pulses.

+ * We will generate clock with rate 100 KHz and so duration of both clock levels + * is: delay in ns = (10^6 / 100) / 2 + */ +#define RECOVERY_NDELAY 5000 +#define RECOVERY_CLK_CNT 9 + +static int i2c_generic_recovery(struct i2c_adapter *adap) +{ + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + int i = 0, val = 1, ret = 0; + + if (bri->prepare_recovery) + bri->prepare_recovery(bri); + + /* + * By this time SCL is high, as we need to give 9 falling-rising edges + */ + while (i++ < RECOVERY_CLK_CNT * 2) { + if (val) { + /* Break if SDA is high */ + if (bri->get_sda && bri->get_sda(adap)) + break; + /* SCL shouldn't be low here */ + if (!bri->get_scl(adap)) { + dev_err(&adap->dev, + "SCL is stuck low, exit recovery\n"); + ret = -EBUSY; + break; + } + } + + val = !val; + bri->set_scl(adap, val); + ndelay(RECOVERY_NDELAY); + } + + if (bri->unprepare_recovery) + bri->unprepare_recovery(bri); + + return ret; +} + +int i2c_generic_scl_recovery(struct i2c_adapter *adap) +{ + adap->bus_recovery_info->set_scl(adap, 1); + return i2c_generic_recovery(adap); +} + +int i2c_generic_gpio_recovery(struct i2c_adapter *adap) +{ + int ret; + + ret = i2c_get_gpios_for_recovery(adap); + if (ret) + return ret; + + ret = i2c_generic_recovery(adap); + i2c_put_gpios_for_recovery(adap); + + return ret; +} + +int i2c_recover_bus(struct i2c_adapter *adap) +{ + if (!adap->bus_recovery_info) + return -EOPNOTSUPP; + + dev_dbg(&adap->dev, "Trying i2c bus recovery\n"); + return adap->bus_recovery_info->recover_bus(adap); +} + static int i2c_device_probe(struct device *dev) { struct i2c_client *client = i2c_verify_client(dev); @@ -902,6 +1028,39 @@ static int i2c_register_adapter(struct i2c_adapter *adap) "Failed to create compatibility class link\n"); #endif + /* bus recovery specific initialization */ + if (adap->bus_recovery_info) { + struct i2c_bus_recovery_info *bri = adap->bus_recovery_info; + + if (!bri->recover_bus) { + dev_err(&adap->dev, "No recover_bus() found, not using recovery\n"); + adap->bus_recovery_info = NULL; + goto exit_recovery; + } + + /* Generic GPIO recovery */ + if (bri->recover_bus == i2c_generic_gpio_recovery) { + if (!gpio_is_valid(bri->scl_gpio)) { + dev_err(&adap->dev, "Invalid SCL gpio, not using recovery\n"); + adap->bus_recovery_info = NULL; + goto exit_recovery; + } + + if (gpio_is_valid(bri->sda_gpio)) + bri->get_sda = get_sda_gpio_value; + else + bri->get_sda = NULL; + + bri->get_scl = get_scl_gpio_value; + bri->set_scl = set_scl_gpio_value; + } else if (!bri->set_scl || !bri->get_scl) { + /* Generic SCL recovery */ + dev_err(&adap->dev, "No {get|set}_gpio() found, not using recovery\n"); + adap->bus_recovery_info = NULL; + } + } + +exit_recovery: /* create pre-declared device nodes */ if (adap->nr < __i2c_first_dynamic_bus_num) i2c_scan_static_board_info(adap); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index d0c4db7b4872..2eca3860b77f 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -370,6 +370,45 @@ struct i2c_algorithm { u32 (*functionality) (struct i2c_adapter *); }; +/** + * struct i2c_bus_recovery_info - I2C bus recovery information + * @recover_bus: Recover routine. Either pass driver's recover_bus() routine, or + * i2c_generic_scl_recovery() or i2c_generic_gpio_recovery(). + * @get_scl: This gets current value of SCL line. Mandatory for generic SCL + * recovery. Used internally for generic GPIO recovery. + * @set_scl: This sets/clears SCL line. Mandatory for generic SCL recovery. Used + * internally for generic GPIO recovery. 
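A usage sketch for the struct documented here, with a hypothetical driver and hypothetical GPIO numbers: an adapter driver opting into generic GPIO recovery fills in the callbacks and pins, attaches the struct to its adapter before registration, and later calls i2c_recover_bus() when a transfer finds the bus stuck.

	/* Hypothetical adapter driver wiring up generic GPIO recovery. */
	static struct i2c_bus_recovery_info foo_recovery = {
		.recover_bus	= i2c_generic_gpio_recovery,
		.scl_gpio	= 42,	/* hypothetical board wiring */
		.sda_gpio	= 43,	/* optional: enables SDA polling */
	};

	foo_adap.bus_recovery_info = &foo_recovery;
	/* registration validates recover_bus and, for the generic GPIO
	 * case, fills in get_scl/set_scl as shown above */
	ret = i2c_add_adapter(&foo_adap);

	/* later, when a transfer finds SCL/SDA stuck: */
	i2c_recover_bus(&foo_adap);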
+ * @get_sda: This gets current value of SDA line. Optional for generic SCL + * recovery. Used internally, if sda_gpio is a valid GPIO, for generic GPIO + * recovery. + * @prepare_recovery: This will be called before starting recovery. Platform may + * configure padmux here for SDA/SCL line or something else they want. + * @unprepare_recovery: This will be called after completing recovery. Platform + * may configure padmux here for SDA/SCL line or something else they want. + * @scl_gpio: gpio number of the SCL line. Only required for GPIO recovery. + * @sda_gpio: gpio number of the SDA line. Only required for GPIO recovery. + */ +struct i2c_bus_recovery_info { + int (*recover_bus)(struct i2c_adapter *); + + int (*get_scl)(struct i2c_adapter *); + void (*set_scl)(struct i2c_adapter *, int val); + int (*get_sda)(struct i2c_adapter *); + + void (*prepare_recovery)(struct i2c_bus_recovery_info *bri); + void (*unprepare_recovery)(struct i2c_bus_recovery_info *bri); + + /* gpio recovery */ + int scl_gpio; + int sda_gpio; +}; + +int i2c_recover_bus(struct i2c_adapter *adap); + +/* Generic recovery routines */ +int i2c_generic_gpio_recovery(struct i2c_adapter *adap); +int i2c_generic_scl_recovery(struct i2c_adapter *adap); + /* * i2c_adapter is the structure used to identify a physical i2c bus along * with the access algorithms necessary to access it. @@ -393,6 +432,8 @@ struct i2c_adapter { struct mutex userspace_clients_lock; struct list_head userspace_clients; + + struct i2c_bus_recovery_info *bus_recovery_info; }; #define to_i2c_adapter(d) container_of(d, struct i2c_adapter, dev) -- cgit From 49a64ac555f1dabd2b94325553187d0db6ecac16 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Thu, 21 Mar 2013 08:08:46 +0000 Subject: i2c: tegra: assume CONFIG_OF, remove platform data Tegra only supports, and always enables, device tree. Remove all ifdefs and runtime checks for DT support from the driver. Platform data is therefore no longer required. Delete the header that defines it. 
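With platform data gone, reading the optional "clock-frequency" property reduces to the standard pattern visible in the diff below; sketched on its own, with dev standing in for the adapter's struct device:

	u32 rate;

	/* of_property_read_u32() returns non-zero when the property is
	 * missing or malformed, so the old default is preserved */
	if (of_property_read_u32(dev->of_node, "clock-frequency", &rate))
		rate = 100000;	/* standard-mode 100 kHz default */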
Signed-off-by: Stephen Warren Signed-off-by: Wolfram Sang --- arch/arm/mach-tegra/board-dt-tegra20.c | 2 -- drivers/i2c/busses/i2c-tegra.c | 26 +++++++------------------- include/linux/i2c-tegra.h | 25 ------------------------- 3 files changed, 7 insertions(+), 46 deletions(-) delete mode 100644 include/linux/i2c-tegra.h (limited to 'include/linux') diff --git a/arch/arm/mach-tegra/board-dt-tegra20.c b/arch/arm/mach-tegra/board-dt-tegra20.c index a0edf2510280..6e1c9a91848f 100644 --- a/arch/arm/mach-tegra/board-dt-tegra20.c +++ b/arch/arm/mach-tegra/board-dt-tegra20.c @@ -30,8 +30,6 @@ #include #include #include -#include -#include #include #include diff --git a/drivers/i2c/busses/i2c-tegra.c b/drivers/i2c/busses/i2c-tegra.c index b714776b6ddd..b60ff90adc39 100644 --- a/drivers/i2c/busses/i2c-tegra.c +++ b/drivers/i2c/busses/i2c-tegra.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -172,7 +171,7 @@ struct tegra_i2c_dev { u8 *msg_buf; size_t msg_buf_remaining; int msg_read; - unsigned long bus_clk_rate; + u32 bus_clk_rate; bool is_suspended; }; @@ -694,7 +693,6 @@ static const struct tegra_i2c_hw_feature tegra114_i2c_hw = { .clk_divisor_std_fast_mode = 0x19, }; -#if defined(CONFIG_OF) /* Match table for of_platform binding */ static const struct of_device_id tegra_i2c_of_match[] = { { .compatible = "nvidia,tegra114-i2c", .data = &tegra114_i2c_hw, }, @@ -704,16 +702,13 @@ static const struct of_device_id tegra_i2c_of_match[] = { {}, }; MODULE_DEVICE_TABLE(of, tegra_i2c_of_match); -#endif static int tegra_i2c_probe(struct platform_device *pdev) { struct tegra_i2c_dev *i2c_dev; - struct tegra_i2c_platform_data *pdata = pdev->dev.platform_data; struct resource *res; struct clk *div_clk; struct clk *fast_clk; - const unsigned int *prop; void __iomem *base; int irq; int ret = 0; @@ -754,23 +749,16 @@ static int tegra_i2c_probe(struct platform_device *pdev) i2c_dev->cont_id = pdev->id; i2c_dev->dev = &pdev->dev; - i2c_dev->bus_clk_rate = 100000; /* default clock rate */ - if (pdata) { - i2c_dev->bus_clk_rate = pdata->bus_clk_rate; - - } else if (i2c_dev->dev->of_node) { /* if there is a device tree node ... */ - prop = of_get_property(i2c_dev->dev->of_node, - "clock-frequency", NULL); - if (prop) - i2c_dev->bus_clk_rate = be32_to_cpup(prop); - } + ret = of_property_read_u32(i2c_dev->dev->of_node, "clock-frequency", + &i2c_dev->bus_clk_rate); + if (ret) + i2c_dev->bus_clk_rate = 100000; /* default clock rate */ i2c_dev->hw = &tegra20_i2c_hw; if (pdev->dev.of_node) { const struct of_device_id *match; - match = of_match_device(of_match_ptr(tegra_i2c_of_match), - &pdev->dev); + match = of_match_device(tegra_i2c_of_match, &pdev->dev); i2c_dev->hw = match->data; i2c_dev->is_dvc = of_device_is_compatible(pdev->dev.of_node, "nvidia,tegra20-i2c-dvc"); @@ -876,7 +864,7 @@ static struct platform_driver tegra_i2c_driver = { .driver = { .name = "tegra-i2c", .owner = THIS_MODULE, - .of_match_table = of_match_ptr(tegra_i2c_of_match), + .of_match_table = tegra_i2c_of_match, .pm = TEGRA_I2C_PM, }, }; diff --git a/include/linux/i2c-tegra.h b/include/linux/i2c-tegra.h deleted file mode 100644 index 9c85da49857a..000000000000 --- a/include/linux/i2c-tegra.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * drivers/i2c/busses/i2c-tegra.c - * - * Copyright (C) 2010 Google, Inc. 
- * Author: Colin Cross - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - */ - -#ifndef _LINUX_I2C_TEGRA_H -#define _LINUX_I2C_TEGRA_H - -struct tegra_i2c_platform_data { - unsigned long bus_clk_rate; -}; - -#endif /* _LINUX_I2C_TEGRA_H */ -- cgit From d4e1a692e9e85f9cbee090ea8d6158b133d32157 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 4 Mar 2013 21:30:41 +0000 Subject: ACPI: Remove acpi_device dependency in acpi_device_set_id() This patch updates the internal operations of acpi_device_set_id() to setup acpi_device_pnp without using acpi_device. There is no functional change to acpi_device_set_id() in this patch. acpi_pnp_type is added to acpi_device_pnp, so that PNPID type is self-contained within acpi_device_pnp. acpi_add_id(), acpi_bay_match(), acpi_dock_match(), acpi_ibm_smbus_match() and acpi_is_video_device() are changed to take acpi_handle as an argument, instead of acpi_device. Signed-off-by: Toshi Kani Signed-off-by: Rafael J. Wysocki --- drivers/acpi/scan.c | 69 +++++++++++++++++------------------ drivers/acpi/video_detect.c | 25 ++++++------- drivers/gpu/drm/i915/intel_opregion.c | 4 +- include/acpi/acpi_bus.h | 14 +++++-- include/linux/acpi.h | 4 +- 5 files changed, 59 insertions(+), 57 deletions(-) (limited to 'include/linux') diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index d69d77ab9c7e..f9c698d766f1 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -526,7 +526,7 @@ static int acpi_device_setup_files(struct acpi_device *dev) goto end; } - if (dev->flags.bus_address) + if (dev->pnp.type.bus_address) result = device_create_file(&dev->dev, &dev_attr_adr); if (dev->pnp.unique_id) result = device_create_file(&dev->dev, &dev_attr_uid); @@ -599,7 +599,7 @@ static void acpi_device_remove_files(struct acpi_device *dev) if (dev->pnp.unique_id) device_remove_file(&dev->dev, &dev_attr_uid); - if (dev->flags.bus_address) + if (dev->pnp.type.bus_address) device_remove_file(&dev->dev, &dev_attr_adr); device_remove_file(&dev->dev, &dev_attr_modalias); device_remove_file(&dev->dev, &dev_attr_hid); @@ -1406,19 +1406,17 @@ static void acpi_device_get_busid(struct acpi_device *device) } /* - * acpi_bay_match - see if a device is an ejectable driver bay + * acpi_bay_match - see if an acpi object is an ejectable driver bay * * If an acpi object is ejectable and has one of the ACPI ATA methods defined, * then we can safely call it an ejectable drive bay */ -static int acpi_bay_match(struct acpi_device *device){ +static int acpi_bay_match(acpi_handle handle) +{ acpi_status status; - acpi_handle handle; acpi_handle tmp; acpi_handle phandle; - handle = device->handle; - status = acpi_get_handle(handle, "_EJ0", &tmp); if (ACPI_FAILURE(status)) return -ENODEV; @@ -1442,12 +1440,12 @@ static int acpi_bay_match(struct acpi_device *device){ } /* - * acpi_dock_match - see if a device has a _DCK method + * acpi_dock_match - see if an acpi object has a _DCK method */ -static int acpi_dock_match(struct acpi_device *device) +static int acpi_dock_match(acpi_handle handle) { acpi_handle tmp; - return acpi_get_handle(device->handle, "_DCK", &tmp); + return 
acpi_get_handle(handle, "_DCK", &tmp); } const char *acpi_device_hid(struct acpi_device *device) @@ -1462,7 +1460,7 @@ const char *acpi_device_hid(struct acpi_device *device) } EXPORT_SYMBOL(acpi_device_hid); -static void acpi_add_id(struct acpi_device *device, const char *dev_id) +static void acpi_add_id(struct acpi_device_pnp *pnp, const char *dev_id) { struct acpi_hardware_id *id; @@ -1476,7 +1474,8 @@ static void acpi_add_id(struct acpi_device *device, const char *dev_id) return; } - list_add_tail(&id->list, &device->pnp.ids); + list_add_tail(&id->list, &pnp->ids); + pnp->type.hardware_id = 1; } /* @@ -1484,7 +1483,7 @@ static void acpi_add_id(struct acpi_device *device, const char *dev_id) * lacks the SMBUS01 HID and the methods do not have the necessary "_" * prefix. Work around this. */ -static int acpi_ibm_smbus_match(struct acpi_device *device) +static int acpi_ibm_smbus_match(acpi_handle handle) { acpi_handle h_dummy; struct acpi_buffer path = {ACPI_ALLOCATE_BUFFER, NULL}; @@ -1494,7 +1493,7 @@ static int acpi_ibm_smbus_match(struct acpi_device *device) return -ENODEV; /* Look for SMBS object */ - result = acpi_get_name(device->handle, ACPI_SINGLE_NAME, &path); + result = acpi_get_name(handle, ACPI_SINGLE_NAME, &path); if (result) return result; @@ -1505,9 +1504,9 @@ static int acpi_ibm_smbus_match(struct acpi_device *device) /* Does it have the necessary (but misnamed) methods? */ result = -ENODEV; - if (ACPI_SUCCESS(acpi_get_handle(device->handle, "SBI", &h_dummy)) && - ACPI_SUCCESS(acpi_get_handle(device->handle, "SBR", &h_dummy)) && - ACPI_SUCCESS(acpi_get_handle(device->handle, "SBW", &h_dummy))) + if (ACPI_SUCCESS(acpi_get_handle(handle, "SBI", &h_dummy)) && + ACPI_SUCCESS(acpi_get_handle(handle, "SBR", &h_dummy)) && + ACPI_SUCCESS(acpi_get_handle(handle, "SBW", &h_dummy))) result = 0; out: kfree(path.pointer); @@ -1524,7 +1523,7 @@ static void acpi_device_set_id(struct acpi_device *device) switch (device->device_type) { case ACPI_BUS_TYPE_DEVICE: if (ACPI_IS_ROOT_DEVICE(device)) { - acpi_add_id(device, ACPI_SYSTEM_HID); + acpi_add_id(&device->pnp, ACPI_SYSTEM_HID); break; } @@ -1535,15 +1534,15 @@ static void acpi_device_set_id(struct acpi_device *device) } if (info->valid & ACPI_VALID_HID) - acpi_add_id(device, info->hardware_id.string); + acpi_add_id(&device->pnp, info->hardware_id.string); if (info->valid & ACPI_VALID_CID) { cid_list = &info->compatible_id_list; for (i = 0; i < cid_list->count; i++) - acpi_add_id(device, cid_list->ids[i].string); + acpi_add_id(&device->pnp, cid_list->ids[i].string); } if (info->valid & ACPI_VALID_ADR) { device->pnp.bus_address = info->address; - device->flags.bus_address = 1; + device->pnp.type.bus_address = 1; } if (info->valid & ACPI_VALID_UID) device->pnp.unique_id = kstrdup(info->unique_id.string, @@ -1555,36 +1554,36 @@ static void acpi_device_set_id(struct acpi_device *device) * Some devices don't reliably have _HIDs & _CIDs, so add * synthetic HIDs to make sure drivers can find them. 
*/ - if (acpi_is_video_device(device)) - acpi_add_id(device, ACPI_VIDEO_HID); - else if (ACPI_SUCCESS(acpi_bay_match(device))) - acpi_add_id(device, ACPI_BAY_HID); - else if (ACPI_SUCCESS(acpi_dock_match(device))) - acpi_add_id(device, ACPI_DOCK_HID); - else if (!acpi_ibm_smbus_match(device)) - acpi_add_id(device, ACPI_SMBUS_IBM_HID); + if (acpi_is_video_device(device->handle)) + acpi_add_id(&device->pnp, ACPI_VIDEO_HID); + else if (ACPI_SUCCESS(acpi_bay_match(device->handle))) + acpi_add_id(&device->pnp, ACPI_BAY_HID); + else if (ACPI_SUCCESS(acpi_dock_match(device->handle))) + acpi_add_id(&device->pnp, ACPI_DOCK_HID); + else if (!acpi_ibm_smbus_match(device->handle)) + acpi_add_id(&device->pnp, ACPI_SMBUS_IBM_HID); else if (list_empty(&device->pnp.ids) && ACPI_IS_ROOT_DEVICE(device->parent)) { - acpi_add_id(device, ACPI_BUS_HID); /* \_SB, LNXSYBUS */ + acpi_add_id(&device->pnp, ACPI_BUS_HID); /* \_SB, LNXSYBUS */ strcpy(device->pnp.device_name, ACPI_BUS_DEVICE_NAME); strcpy(device->pnp.device_class, ACPI_BUS_CLASS); } break; case ACPI_BUS_TYPE_POWER: - acpi_add_id(device, ACPI_POWER_HID); + acpi_add_id(&device->pnp, ACPI_POWER_HID); break; case ACPI_BUS_TYPE_PROCESSOR: - acpi_add_id(device, ACPI_PROCESSOR_OBJECT_HID); + acpi_add_id(&device->pnp, ACPI_PROCESSOR_OBJECT_HID); break; case ACPI_BUS_TYPE_THERMAL: - acpi_add_id(device, ACPI_THERMAL_HID); + acpi_add_id(&device->pnp, ACPI_THERMAL_HID); break; case ACPI_BUS_TYPE_POWER_BUTTON: - acpi_add_id(device, ACPI_BUTTON_HID_POWERF); + acpi_add_id(&device->pnp, ACPI_BUTTON_HID_POWERF); break; case ACPI_BUS_TYPE_SLEEP_BUTTON: - acpi_add_id(device, ACPI_BUTTON_HID_SLEEPF); + acpi_add_id(&device->pnp, ACPI_BUTTON_HID_SLEEPF); break; } } diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 4ac2593234e7..66f67626f02e 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -67,40 +67,37 @@ acpi_backlight_cap_match(acpi_handle handle, u32 level, void *context, return 0; } -/* Returns true if the device is a video device which can be handled by - * video.ko. +/* Returns true if the ACPI object is a video device which can be + * handled by video.ko. * The device will get a Linux specific CID added in scan.c to * identify the device as an ACPI graphics device * Be aware that the graphics device may not be physically present * Use acpi_video_get_capabilities() to detect general ACPI video * capabilities of present cards */ -long acpi_is_video_device(struct acpi_device *device) +long acpi_is_video_device(acpi_handle handle) { acpi_handle h_dummy; long video_caps = 0; - if (!device) - return 0; - /* Is this device able to support video switching ? */ - if (ACPI_SUCCESS(acpi_get_handle(device->handle, "_DOD", &h_dummy)) || - ACPI_SUCCESS(acpi_get_handle(device->handle, "_DOS", &h_dummy))) + if (ACPI_SUCCESS(acpi_get_handle(handle, "_DOD", &h_dummy)) || + ACPI_SUCCESS(acpi_get_handle(handle, "_DOS", &h_dummy))) video_caps |= ACPI_VIDEO_OUTPUT_SWITCHING; /* Is this device able to retrieve a video ROM ? */ - if (ACPI_SUCCESS(acpi_get_handle(device->handle, "_ROM", &h_dummy))) + if (ACPI_SUCCESS(acpi_get_handle(handle, "_ROM", &h_dummy))) video_caps |= ACPI_VIDEO_ROM_AVAILABLE; /* Is this device able to configure which video head to be POSTed ? 
*/ - if (ACPI_SUCCESS(acpi_get_handle(device->handle, "_VPO", &h_dummy)) && - ACPI_SUCCESS(acpi_get_handle(device->handle, "_GPD", &h_dummy)) && - ACPI_SUCCESS(acpi_get_handle(device->handle, "_SPD", &h_dummy))) + if (ACPI_SUCCESS(acpi_get_handle(handle, "_VPO", &h_dummy)) && + ACPI_SUCCESS(acpi_get_handle(handle, "_GPD", &h_dummy)) && + ACPI_SUCCESS(acpi_get_handle(handle, "_SPD", &h_dummy))) video_caps |= ACPI_VIDEO_DEVICE_POSTING; /* Only check for backlight functionality if one of the above hit. */ if (video_caps) - acpi_walk_namespace(ACPI_TYPE_DEVICE, device->handle, + acpi_walk_namespace(ACPI_TYPE_DEVICE, handle, ACPI_UINT32_MAX, acpi_backlight_cap_match, NULL, &video_caps, NULL); @@ -127,7 +124,7 @@ find_video(acpi_handle handle, u32 lvl, void *context, void **rv) if (!dev) return AE_OK; pci_dev_put(dev); - *cap |= acpi_is_video_device(acpi_dev); + *cap |= acpi_is_video_device(handle); } return AE_OK; } diff --git a/drivers/gpu/drm/i915/intel_opregion.c b/drivers/gpu/drm/i915/intel_opregion.c index 4d338740f2cb..a8117e614009 100644 --- a/drivers/gpu/drm/i915/intel_opregion.c +++ b/drivers/gpu/drm/i915/intel_opregion.c @@ -350,11 +350,11 @@ static void intel_didl_outputs(struct drm_device *dev) if (!handle || acpi_bus_get_device(handle, &acpi_dev)) return; - if (acpi_is_video_device(acpi_dev)) + if (acpi_is_video_device(handle)) acpi_video_bus = acpi_dev; else { list_for_each_entry(acpi_cdev, &acpi_dev->children, node) { - if (acpi_is_video_device(acpi_cdev)) { + if (acpi_is_video_device(acpi_cdev->handle)) { acpi_video_bus = acpi_cdev; break; } diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 533ef039c5e0..3cb3da8ac9d9 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -161,7 +161,6 @@ struct acpi_device_status { struct acpi_device_flags { u32 dynamic_status:1; - u32 bus_address:1; u32 removable:1; u32 ejectable:1; u32 suprise_removal_ok:1; @@ -169,7 +168,7 @@ struct acpi_device_flags { u32 performance_manageable:1; u32 eject_pending:1; u32 match_driver:1; - u32 reserved:23; + u32 reserved:24; }; /* File System */ @@ -192,10 +191,17 @@ struct acpi_hardware_id { char *id; }; +struct acpi_pnp_type { + u32 hardware_id:1; + u32 bus_address:1; + u32 reserved:30; +}; + struct acpi_device_pnp { - acpi_bus_id bus_id; /* Object name */ + acpi_bus_id bus_id; /* Object name */ + struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ - char *unique_id; /* _UID */ + char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bcbdd7484e58..edaf311473e5 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -204,7 +204,7 @@ extern bool wmi_has_guid(const char *guid); #if defined(CONFIG_ACPI_VIDEO) || defined(CONFIG_ACPI_VIDEO_MODULE) extern long acpi_video_get_capabilities(acpi_handle graphics_dev_handle); -extern long acpi_is_video_device(struct acpi_device *device); +extern long acpi_is_video_device(acpi_handle handle); extern void acpi_video_dmi_promote_vendor(void); extern void acpi_video_dmi_demote_vendor(void); extern int acpi_video_backlight_support(void); @@ -217,7 +217,7 @@ static inline long acpi_video_get_capabilities(acpi_handle graphics_dev_handle) return 0; } -static inline long acpi_is_video_device(struct acpi_device *device) +static inline long acpi_is_video_device(acpi_handle handle) { return 0; } -- cgit From 
c58c844187df61ef7cc103d0abb5dd6198bcfcd6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 18 Mar 2013 19:45:14 -0400 Subject: NFS: Don't accept more reads/writes if the open context recovery failed If the state recovery failed, we want to ensure that the application doesn't try to use the same file descriptor for more reads or writes. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4filelayout.c | 8 ++++++++ fs/nfs/nfs4state.c | 16 ++++++++++++++++ fs/nfs/pagelist.c | 2 ++ fs/nfs/read.c | 2 ++ fs/nfs/write.c | 2 ++ include/linux/nfs_fs.h | 1 + 6 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 1ee5737211d7..4ba32e23eddd 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -305,6 +305,10 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) { struct nfs_read_data *rdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &rdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(rdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_read(rdata); @@ -407,6 +411,10 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) { struct nfs_write_data *wdata = data; + if (unlikely(test_bit(NFS_CONTEXT_BAD, &wdata->args.context->flags))) { + rpc_exit(task, -EIO); + return; + } if (filelayout_reset_to_mds(wdata->header->lseg)) { dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); filelayout_reset_write(wdata); diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index fec1c5bb4863..8db102c7add6 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1328,9 +1328,25 @@ void nfs_inode_find_state_and_recover(struct inode *inode, nfs4_schedule_state_manager(clp); } +static void nfs4_state_mark_open_context_bad(struct nfs4_state *state) +{ + struct inode *inode = state->inode; + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs_open_context *ctx; + + spin_lock(&inode->i_lock); + list_for_each_entry(ctx, &nfsi->open_files, list) { + if (ctx->state != state) + continue; + set_bit(NFS_CONTEXT_BAD, &ctx->flags); + } + spin_unlock(&inode->i_lock); +} + static void nfs4_state_mark_recovery_failed(struct nfs4_state *state, int error) { set_bit(NFS_STATE_RECOVERY_FAILED, &state->flags); + nfs4_state_mark_open_context_bad(state); } diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e56e846e9d2d..7f0933086b36 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -104,6 +104,8 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, struct nfs_page *req; struct nfs_lock_context *l_ctx; + if (test_bit(NFS_CONTEXT_BAD, &ctx->flags)) + return ERR_PTR(-EBADF); /* try to allocate the request struct */ req = nfs_page_alloc(); if (req == NULL) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index a5e5d9899d56..70a26c651f09 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -514,6 +514,8 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; NFS_PROTO(data->header->inode)->read_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } static const struct rpc_call_ops nfs_read_common_ops = { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index c483cc50b82e..a2c7c28049d5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1251,6 +1251,8 @@ void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; 
NFS_PROTO(data->header->inode)->write_rpc_prepare(task, data); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &data->args.context->flags))) + rpc_exit(task, -EIO); } void nfs_commit_prepare(struct rpc_task *task, void *calldata) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1cc25682b20b..f6b1956f3c86 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -77,6 +77,7 @@ struct nfs_open_context { unsigned long flags; #define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) +#define NFS_CONTEXT_BAD (2) int error; struct list_head list; -- cgit From 9b20614988199fb03580b335a28250922e902098 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Mar 2013 15:52:00 -0400 Subject: NFSv4: The stateid must remain the same for replayed RPC calls If we replay a READ or WRITE call, we should not be changing the stateid. Currently, we may end up doing so, because the stateid is only selected at xdr encode time. This patch ensures that we select the stateid after we get an NFSv4.1 session slot, and that we keep that same stateid across retries. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 4 ++++ fs/nfs/nfs4filelayout.c | 14 ++++++++++---- fs/nfs/nfs4proc.c | 27 +++++++++++++++++++++++---- fs/nfs/nfs4xdr.c | 28 ++-------------------------- include/linux/nfs_xdr.h | 2 ++ 5 files changed, 41 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 9ce90135bf22..8309e98c44f9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -234,6 +234,10 @@ extern struct rpc_clnt *nfs4_proc_lookup_mountpoint(struct inode *, struct qstr extern int nfs4_proc_secinfo(struct inode *, const struct qstr *, struct nfs4_secinfo_flavors *); extern int nfs4_release_lockowner(struct nfs4_lock_state *); extern const struct xattr_handler *nfs4_xattr_handlers[]; +extern void nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode); #if defined(CONFIG_NFS_V4_1) static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) diff --git a/fs/nfs/nfs4filelayout.c b/fs/nfs/nfs4filelayout.c index 4ba32e23eddd..22d10623f5ee 100644 --- a/fs/nfs/nfs4filelayout.c +++ b/fs/nfs/nfs4filelayout.c @@ -317,10 +317,13 @@ static void filelayout_read_prepare(struct rpc_task *task, void *data) } rdata->read_done_cb = filelayout_read_done_cb; - nfs41_setup_sequence(rdata->ds_clp->cl_session, + if (nfs41_setup_sequence(rdata->ds_clp->cl_session, &rdata->args.seq_args, &rdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&rdata->args.stateid, rdata->args.context, + rdata->args.lock_context, FMODE_READ); } static void filelayout_read_call_done(struct rpc_task *task, void *data) @@ -421,10 +424,13 @@ static void filelayout_write_prepare(struct rpc_task *task, void *data) rpc_exit(task, 0); return; } - nfs41_setup_sequence(wdata->ds_clp->cl_session, + if (nfs41_setup_sequence(wdata->ds_clp->cl_session, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&wdata->args.stateid, wdata->args.context, + wdata->args.lock_context, FMODE_WRITE); } static void filelayout_write_call_done(struct rpc_task *task, void *data) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index c3bbb6c53d61..26176ce3d96a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -3454,6 +3454,19 @@ static int nfs4_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, return err; } +void 
nfs4_set_rw_stateid(nfs4_stateid *stateid, + const struct nfs_open_context *ctx, + const struct nfs_lock_context *l_ctx, + fmode_t fmode) +{ + const struct nfs_lockowner *lockowner = NULL; + + if (l_ctx != NULL) + lockowner = &l_ctx->lockowner; + nfs4_select_rw_stateid(stateid, ctx->state, fmode, lockowner); +} +EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); + void __nfs4_read_done_cb(struct nfs_read_data *data) { nfs_invalidate_atime(data->header->inode); @@ -3496,10 +3509,13 @@ static void nfs4_proc_read_setup(struct nfs_read_data *data, struct rpc_message static void nfs4_proc_read_rpc_prepare(struct rpc_task *task, struct nfs_read_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_READ); } static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_write_data *data) @@ -3560,10 +3576,13 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag static void nfs4_proc_write_rpc_prepare(struct rpc_task *task, struct nfs_write_data *data) { - nfs4_setup_sequence(NFS_SERVER(data->header->inode), + if (nfs4_setup_sequence(NFS_SERVER(data->header->inode), &data->args.seq_args, &data->res.seq_res, - task); + task)) + return; + nfs4_set_rw_stateid(&data->args.stateid, data->args.context, + data->args.lock_context, FMODE_WRITE); } static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e3edda554ac7..9d328777b4c1 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1506,35 +1506,12 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) encode_op_hdr(xdr, OP_PUTROOTFH, decode_putrootfh_maxsz, hdr); } -static void encode_open_stateid(struct xdr_stream *xdr, - const struct nfs_open_context *ctx, - const struct nfs_lock_context *l_ctx, - fmode_t fmode, - int zero_seqid) -{ - nfs4_stateid stateid; - - if (ctx->state != NULL) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - nfs4_select_rw_stateid(&stateid, ctx->state, - fmode, lockowner); - if (zero_seqid) - stateid.seqid = 0; - encode_nfs4_stateid(xdr, &stateid); - } else - encode_nfs4_stateid(xdr, &zero_stateid); -} - static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; encode_op_hdr(xdr, OP_READ, decode_read_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_READ, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, args->offset); @@ -1670,8 +1647,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg __be32 *p; encode_op_hdr(xdr, OP_WRITE, decode_write_maxsz, hdr); - encode_open_stateid(xdr, args->context, args->lock_context, - FMODE_WRITE, hdr->minorversion); + encode_nfs4_stateid(xdr, &args->stateid); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 4b993d358dad..90a4aa190b43 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -486,6 +486,7 @@ struct nfs_readargs { struct nfs_fh * fh; struct nfs_open_context *context; struct nfs_lock_context *lock_context; + nfs4_stateid stateid; __u64 offset; __u32 count; unsigned 
int pgbase; @@ -507,6 +508,7 @@ struct nfs_writeargs { struct nfs_fh * fh; struct nfs_open_context *context; struct nfs_lock_context *lock_context; + nfs4_stateid stateid; __u64 offset; __u32 count; enum nfs3_stable_how stable; -- cgit From 3b66486c4c7136f8d4bbe1306d581fadc6bce4c7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 17 Mar 2013 15:31:15 -0400 Subject: NFSv4.1: Select the "most recent locking state" for read/write/setattr stateids Follow the practice described in section 8.2.2 of RFC5661: When sending a read/write or setattr stateid, set the seqid field to zero in order to signal that the NFS server should apply the most recent locking state. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 3 ++- fs/nfs/nfs4state.c | 2 ++ include/linux/nfs_fs_sb.h | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 22c80f9ed824..625a729ebcca 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6841,7 +6841,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN | NFS_CAP_CHANGE_ATTR - | NFS_CAP_POSIX_LOCK, + | NFS_CAP_POSIX_LOCK + | NFS_CAP_STATEID_NFSV41, .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4e95bd72f480..685b1e953ed8 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1053,6 +1053,8 @@ int nfs4_select_rw_stateid(nfs4_stateid *dst, struct nfs4_state *state, goto out; ret = nfs4_copy_open_stateid(dst, state); out: + if (nfs_server_capable(state->inode, NFS_CAP_STATEID_NFSV41)) + dst->seqid = 0; return ret; } diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 6c6ed153a9b4..74c9e52c9338 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -197,5 +197,6 @@ struct nfs_server { #define NFS_CAP_MTIME (1U << 13) #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) +#define NFS_CAP_STATEID_NFSV41 (1U << 16) #endif -- cgit From 49f9a0fafd844c32f2abada047c0b9a5ba0d6255 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 15 Mar 2013 16:44:28 -0400 Subject: NFSv4.1: Enable open-by-filehandle Sometimes, we actually _want_ to do open-by-filehandle, for instance when recovering opens after a network partition, or when called from nfs4_file_open. Enable that functionality using a new capability, NFS_CAP_ATOMIC_OPEN_V1, which is only enabled for NFSv4.1 servers that support it.
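The graceful fallback relies on clearing the capability bit and retrying when an older server rejects the new open claim type with EINVAL, as nfs4_clear_cap_atomic_open_v1() does in the diff below. A condensed sketch of that idiom, using simplified stand-ins for the real nfs_server and nfs4_exception structures:

#include <linux/errno.h>
#include <linux/types.h>

struct server { unsigned int caps; };	/* stand-in for nfs_server */
struct exception { int retry; };	/* stand-in for nfs4_exception */

#define CAP_ATOMIC_OPEN_V1 (1U << 17)

/* Returns true when the error means "server lacks the capability". */
static bool clear_cap_and_retry(struct server *server, int err,
				struct exception *exception)
{
	if (err != -EINVAL || !(server->caps & CAP_ATOMIC_OPEN_V1))
		return false;
	server->caps &= ~CAP_ATOMIC_OPEN_V1;	/* don't try it again */
	exception->retry = 1;			/* redo the operation */
	return true;
}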
Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 ++ fs/nfs/nfs4proc.c | 53 ++++++++++++++++++++++++++++++++++++++++------- include/linux/nfs_fs_sb.h | 1 + 3 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f23f455be42b..e093e73178b7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1486,6 +1486,8 @@ static int nfs4_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto no_open; if (d_mountpoint(dentry)) goto no_open; + if (NFS_SB(dentry->d_sb)->caps & NFS_CAP_ATOMIC_OPEN_V1) + goto no_open; inode = dentry->d_inode; parent = dget_parent(dentry); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7bbb06f280fc..732b76f703d6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -767,6 +767,35 @@ struct nfs4_opendata { int cancelled; }; +static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server, + int err, struct nfs4_exception *exception) +{ + if (err != -EINVAL) + return false; + if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1)) + return false; + server->caps &= ~NFS_CAP_ATOMIC_OPEN_V1; + exception->retry = 1; + return true; +} + +static enum open_claim_type4 +nfs4_map_atomic_open_claim(struct nfs_server *server, + enum open_claim_type4 claim) +{ + if (server->caps & NFS_CAP_ATOMIC_OPEN_V1) + return claim; + switch (claim) { + default: + return claim; + case NFS4_OPEN_CLAIM_FH: + return NFS4_OPEN_CLAIM_NULL; + case NFS4_OPEN_CLAIM_DELEG_CUR_FH: + return NFS4_OPEN_CLAIM_DELEGATE_CUR; + case NFS4_OPEN_CLAIM_DELEG_PREV_FH: + return NFS4_OPEN_CLAIM_DELEGATE_PREV; + } +} static void nfs4_init_opendata_res(struct nfs4_opendata *p) { @@ -818,8 +847,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.claim = claim; - switch (claim) { + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); + switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_CUR: case NFS4_OPEN_CLAIM_DELEGATE_PREV: @@ -1326,6 +1355,8 @@ static int nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state int err; do { err = _nfs4_do_open_reclaim(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -1741,7 +1772,7 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s int ret; opendata = nfs4_open_recoverdata_alloc(ctx, state, - NFS4_OPEN_CLAIM_NULL); + NFS4_OPEN_CLAIM_FH); if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); @@ -1759,6 +1790,8 @@ static int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4_state do { err = _nfs4_open_expired(ctx, state); + if (nfs4_clear_cap_atomic_open_v1(server, err, &exception)) + continue; switch (err) { default: goto out; @@ -1926,6 +1959,7 @@ static int _nfs4_do_open(struct inode *dir, struct nfs4_state *state = NULL; struct nfs_server *server = NFS_SERVER(dir); struct nfs4_opendata *opendata; + enum open_claim_type4 claim = NFS4_OPEN_CLAIM_NULL; int status; /* Protect against reboot recovery conflicts */ @@ -1941,9 +1975,10 @@ static int _nfs4_do_open(struct inode *dir, if (dentry->d_inode != NULL) nfs4_return_incompatible_delegation(dentry->d_inode, fmode); status = -ENOMEM; + if (dentry->d_inode) + claim = NFS4_OPEN_CLAIM_FH; opendata = nfs4_opendata_alloc(dentry, sp, fmode, flags, sattr, - 
NFS4_OPEN_CLAIM_NULL, - GFP_KERNEL); + claim, GFP_KERNEL); if (opendata == NULL) goto err_put_state_owner; @@ -2001,6 +2036,7 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, struct rpc_cred *cred, struct nfs4_threshold **ctx_th) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_state *res; int status; @@ -2044,7 +2080,9 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir, exception.retry = 1; continue; } - res = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + if (nfs4_clear_cap_atomic_open_v1(server, status, &exception)) + continue; + res = ERR_PTR(nfs4_handle_exception(server, status, &exception)); } while (exception.retry); return res; @@ -6858,7 +6896,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_ATOMIC_OPEN | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK - | NFS_CAP_STATEID_NFSV41, + | NFS_CAP_STATEID_NFSV41 + | NFS_CAP_ATOMIC_OPEN_V1, .call_sync = nfs4_call_sync_sequence, .match_stateid = nfs41_match_stateid, .find_root_sec = nfs41_find_root_sec, diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 74c9e52c9338..d8fdfdc7a8fe 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -198,5 +198,6 @@ struct nfs_server { #define NFS_CAP_POSIX_LOCK (1U << 14) #define NFS_CAP_UIDGID_NOMAP (1U << 15) #define NFS_CAP_STATEID_NFSV41 (1U << 16) +#define NFS_CAP_ATOMIC_OPEN_V1 (1U << 17) #endif -- cgit From e44b0ceee4cc2a926225e73ac1e20b9a5bb22c2d Mon Sep 17 00:00:00 2001 From: Kenneth Heitke Date: Tue, 12 Mar 2013 11:41:46 -0700 Subject: add single-wire serial bus interface (SSBI) driver SSBI is the Qualcomm single-wire serial bus interface used to connect the MSM devices to the PMIC and other devices. Since SSBI only supports a single slave, the driver gets the name of the slave device passed in from the board file through the master device's platform data. SSBI registers pretty early (postcore), so that the PMIC can come up before the board init. This is useful if the board init requires the use of gpios that are connected through the PMIC. Based on a patch by Dima Zavin that can be found at: http://android.git.kernel.org/?p=kernel/msm.git;a=commitdiff;h=eb060bac4 This patch adds PMIC Arbiter support for the MSM8660. The PMIC Arbiter is a hardware wrapper around the SSBI 2.0 controller that is designed to overcome concurrency issues and security limitations. A controller_type field is added to the platform data to specify the type of the SSBI controller (1.0, 2.0, or PMIC Arbiter). 
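For illustration, a hypothetical slave driver (not part of this patch) would reach the bus through its parent device, since the master creates the slave's platform device:

#include <linux/msm_ssbi.h>
#include <linux/platform_device.h>

/* Hypothetical SSBI client probe; 0x002 is an example register. */
static int my_pmic_probe(struct platform_device *pdev)
{
	u8 rev;
	int ret;

	/* pdev->dev.parent is the SSBI master that created us */
	ret = msm_ssbi_read(pdev->dev.parent, 0x002, &rev, sizeof(rev));
	if (ret)
		return ret;

	dev_info(&pdev->dev, "PMIC revision: 0x%02x\n", rev);
	return 0;
}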
[davidb@codeaurora.org: I've moved this driver into drivers/ssbi/ and added an include for linux/module.h so that it will compile] Signed-off-by: Kenneth Heitke Signed-off-by: David Brown Signed-off-by: Greg Kroah-Hartman --- drivers/Kconfig | 2 + drivers/Makefile | 1 + drivers/ssbi/Kconfig | 16 ++ drivers/ssbi/Makefile | 1 + drivers/ssbi/ssbi.c | 397 +++++++++++++++++++++++++++++++++++++++++++++++ include/linux/msm_ssbi.h | 49 ++++++ 6 files changed, 466 insertions(+) create mode 100644 drivers/ssbi/Kconfig create mode 100644 drivers/ssbi/Makefile create mode 100644 drivers/ssbi/ssbi.c create mode 100644 include/linux/msm_ssbi.h (limited to 'include/linux') diff --git a/drivers/Kconfig b/drivers/Kconfig index 202fa6d051b9..78a956e286e6 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -52,6 +52,8 @@ source "drivers/i2c/Kconfig" source "drivers/spi/Kconfig" +source "drivers/ssbi/Kconfig" + source "drivers/hsi/Kconfig" source "drivers/pps/Kconfig" diff --git a/drivers/Makefile b/drivers/Makefile index dce39a95fa71..778821ba3f68 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -114,6 +114,7 @@ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ obj-$(CONFIG_ARCH_SHMOBILE) += sh/ +obj-$(CONFIG_MSM_SSBI) += ssbi/ ifndef CONFIG_ARCH_USES_GETTIMEOFFSET obj-y += clocksource/ endif diff --git a/drivers/ssbi/Kconfig b/drivers/ssbi/Kconfig new file mode 100644 index 000000000000..b57c41bd0119 --- /dev/null +++ b/drivers/ssbi/Kconfig @@ -0,0 +1,16 @@ +# +# MSM SSBI bus support +# + +menu "Qualcomm MSM SSBI bus support" + +config MSM_SSBI + bool "Qualcomm Single-wire Serial Bus Interface (SSBI)" + help + If you say yes to this option, support will be included for the + built-in SSBI interface on Qualcomm MSM family processors. + + This is required for communicating with Qualcomm PMICs and + other devices that have the SSBI interface. + +endmenu diff --git a/drivers/ssbi/Makefile b/drivers/ssbi/Makefile new file mode 100644 index 000000000000..22e408f45d61 --- /dev/null +++ b/drivers/ssbi/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_MSM_SSBI) += ssbi.o diff --git a/drivers/ssbi/ssbi.c b/drivers/ssbi/ssbi.c new file mode 100644 index 000000000000..8b0b10d6e1de --- /dev/null +++ b/drivers/ssbi/ssbi.c @@ -0,0 +1,397 @@ +/* Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. + * Copyright (c) 2010, Google Inc. + * + * Original authors: Code Aurora Forum + * + * Author: Dima Zavin + * - Largely rewritten from original to not be an i2c driver. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* SSBI 2.0 controller registers */ +#define SSBI2_CMD 0x0008 +#define SSBI2_RD 0x0010 +#define SSBI2_STATUS 0x0014 +#define SSBI2_MODE2 0x001C + +/* SSBI_CMD fields */ +#define SSBI_CMD_RDWRN (1 << 24) + +/* SSBI_STATUS fields */ +#define SSBI_STATUS_RD_READY (1 << 2) +#define SSBI_STATUS_READY (1 << 1) +#define SSBI_STATUS_MCHN_BUSY (1 << 0) + +/* SSBI_MODE2 fields */ +#define SSBI_MODE2_REG_ADDR_15_8_SHFT 0x04 +#define SSBI_MODE2_REG_ADDR_15_8_MASK (0x7f << SSBI_MODE2_REG_ADDR_15_8_SHFT) + +#define SET_SSBI_MODE2_REG_ADDR_15_8(MD, AD) \ + (((MD) & 0x0F) | ((((AD) >> 8) << SSBI_MODE2_REG_ADDR_15_8_SHFT) & \ + SSBI_MODE2_REG_ADDR_15_8_MASK)) + +/* SSBI PMIC Arbiter command registers */ +#define SSBI_PA_CMD 0x0000 +#define SSBI_PA_RD_STATUS 0x0004 + +/* SSBI_PA_CMD fields */ +#define SSBI_PA_CMD_RDWRN (1 << 24) +#define SSBI_PA_CMD_ADDR_MASK 0x7fff /* REG_ADDR_7_0, REG_ADDR_8_14*/ + +/* SSBI_PA_RD_STATUS fields */ +#define SSBI_PA_RD_STATUS_TRANS_DONE (1 << 27) +#define SSBI_PA_RD_STATUS_TRANS_DENIED (1 << 26) + +#define SSBI_TIMEOUT_US 100 + +struct msm_ssbi { + struct device *dev; + struct device *slave; + void __iomem *base; + spinlock_t lock; + enum msm_ssbi_controller_type controller_type; + int (*read)(struct msm_ssbi *, u16 addr, u8 *buf, int len); + int (*write)(struct msm_ssbi *, u16 addr, u8 *buf, int len); +}; + +#define to_msm_ssbi(dev) platform_get_drvdata(to_platform_device(dev)) + +static inline u32 ssbi_readl(struct msm_ssbi *ssbi, u32 reg) +{ + return readl(ssbi->base + reg); +} + +static inline void ssbi_writel(struct msm_ssbi *ssbi, u32 val, u32 reg) +{ + writel(val, ssbi->base + reg); +} + +static int ssbi_wait_mask(struct msm_ssbi *ssbi, u32 set_mask, u32 clr_mask) +{ + u32 timeout = SSBI_TIMEOUT_US; + u32 val; + + while (timeout--) { + val = ssbi_readl(ssbi, SSBI2_STATUS); + if (((val & set_mask) == set_mask) && ((val & clr_mask) == 0)) + return 0; + udelay(1); + } + + dev_err(ssbi->dev, "%s: timeout (status %x set_mask %x clr_mask %x)\n", + __func__, ssbi_readl(ssbi, SSBI2_STATUS), set_mask, clr_mask); + return -ETIMEDOUT; +} + +static int +msm_ssbi_read_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +{ + u32 cmd = SSBI_CMD_RDWRN | ((addr & 0xff) << 16); + int ret = 0; + + if (ssbi->controller_type == MSM_SBI_CTRL_SSBI2) { + u32 mode2 = ssbi_readl(ssbi, SSBI2_MODE2); + mode2 = SET_SSBI_MODE2_REG_ADDR_15_8(mode2, addr); + ssbi_writel(ssbi, mode2, SSBI2_MODE2); + } + + while (len) { + ret = ssbi_wait_mask(ssbi, SSBI_STATUS_READY, 0); + if (ret) + goto err; + + ssbi_writel(ssbi, cmd, SSBI2_CMD); + ret = ssbi_wait_mask(ssbi, SSBI_STATUS_RD_READY, 0); + if (ret) + goto err; + *buf++ = ssbi_readl(ssbi, SSBI2_RD) & 0xff; + len--; + } + +err: + return ret; +} + +static int +msm_ssbi_write_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +{ + int ret = 0; + + if (ssbi->controller_type == MSM_SBI_CTRL_SSBI2) { + u32 mode2 = ssbi_readl(ssbi, SSBI2_MODE2); + mode2 = SET_SSBI_MODE2_REG_ADDR_15_8(mode2, addr); + ssbi_writel(ssbi, mode2, SSBI2_MODE2); + } + + while (len) { + ret = ssbi_wait_mask(ssbi, SSBI_STATUS_READY, 0); + if (ret) + goto err; + + ssbi_writel(ssbi, ((addr & 0xff) << 16) | *buf, SSBI2_CMD); + ret = ssbi_wait_mask(ssbi, 0, SSBI_STATUS_MCHN_BUSY); + if (ret) + goto err; + buf++; + len--; + } + +err: + return ret; +} + +static inline int +msm_ssbi_pa_transfer(struct msm_ssbi *ssbi, u32 cmd, u8 *data) +{ + u32 timeout 
= SSBI_TIMEOUT_US; + u32 rd_status = 0; + + ssbi_writel(ssbi, cmd, SSBI_PA_CMD); + + while (timeout--) { + rd_status = ssbi_readl(ssbi, SSBI_PA_RD_STATUS); + + if (rd_status & SSBI_PA_RD_STATUS_TRANS_DENIED) { + dev_err(ssbi->dev, "%s: transaction denied (0x%x)\n", + __func__, rd_status); + return -EPERM; + } + + if (rd_status & SSBI_PA_RD_STATUS_TRANS_DONE) { + if (data) + *data = rd_status & 0xff; + return 0; + } + udelay(1); + } + + dev_err(ssbi->dev, "%s: timeout, status 0x%x\n", __func__, rd_status); + return -ETIMEDOUT; +} + +static int +msm_ssbi_pa_read_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +{ + u32 cmd; + int ret = 0; + + cmd = SSBI_PA_CMD_RDWRN | (addr & SSBI_PA_CMD_ADDR_MASK) << 8; + + while (len) { + ret = msm_ssbi_pa_transfer(ssbi, cmd, buf); + if (ret) + goto err; + buf++; + len--; + } + +err: + return ret; +} + +static int +msm_ssbi_pa_write_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +{ + u32 cmd; + int ret = 0; + + while (len) { + cmd = (addr & SSBI_PA_CMD_ADDR_MASK) << 8 | *buf; + ret = msm_ssbi_pa_transfer(ssbi, cmd, NULL); + if (ret) + goto err; + buf++; + len--; + } + +err: + return ret; +} + +int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len) +{ + struct msm_ssbi *ssbi = to_msm_ssbi(dev); + unsigned long flags; + int ret; + + if (ssbi->dev != dev) + return -ENXIO; + + spin_lock_irqsave(&ssbi->lock, flags); + ret = ssbi->read(ssbi, addr, buf, len); + spin_unlock_irqrestore(&ssbi->lock, flags); + + return ret; +} +EXPORT_SYMBOL(msm_ssbi_read); + +int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len) +{ + struct msm_ssbi *ssbi = to_msm_ssbi(dev); + unsigned long flags; + int ret; + + if (ssbi->dev != dev) + return -ENXIO; + + spin_lock_irqsave(&ssbi->lock, flags); + ret = ssbi->write(ssbi, addr, buf, len); + spin_unlock_irqrestore(&ssbi->lock, flags); + + return ret; +} +EXPORT_SYMBOL(msm_ssbi_write); + +static int msm_ssbi_add_slave(struct msm_ssbi *ssbi, + const struct msm_ssbi_slave_info *slave) +{ + struct platform_device *slave_pdev; + int ret; + + if (ssbi->slave) { + pr_err("slave already attached??\n"); + return -EBUSY; + } + + slave_pdev = platform_device_alloc(slave->name, -1); + if (!slave_pdev) { + pr_err("cannot allocate pdev for slave '%s'", slave->name); + ret = -ENOMEM; + goto err; + } + + slave_pdev->dev.parent = ssbi->dev; + slave_pdev->dev.platform_data = slave->platform_data; + + ret = platform_device_add(slave_pdev); + if (ret) { + pr_err("cannot add slave platform device for '%s'\n", + slave->name); + goto err; + } + + ssbi->slave = &slave_pdev->dev; + return 0; + +err: + if (slave_pdev) + platform_device_put(slave_pdev); + return ret; +} + +static int msm_ssbi_probe(struct platform_device *pdev) +{ + const struct msm_ssbi_platform_data *pdata = pdev->dev.platform_data; + struct resource *mem_res; + struct msm_ssbi *ssbi; + int ret = 0; + + if (!pdata) { + pr_err("missing platform data\n"); + return -EINVAL; + } + + pr_debug("%s\n", pdata->slave.name); + + ssbi = kzalloc(sizeof(struct msm_ssbi), GFP_KERNEL); + if (!ssbi) { + pr_err("can not allocate ssbi_data\n"); + return -ENOMEM; + } + + mem_res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!mem_res) { + pr_err("missing mem resource\n"); + ret = -EINVAL; + goto err_get_mem_res; + } + + ssbi->base = ioremap(mem_res->start, resource_size(mem_res)); + if (!ssbi->base) { + pr_err("ioremap of 0x%p failed\n", (void *)mem_res->start); + ret = -EINVAL; + goto err_ioremap; + } + ssbi->dev = &pdev->dev; + platform_set_drvdata(pdev, ssbi); 
+ + ssbi->controller_type = pdata->controller_type; + if (ssbi->controller_type == MSM_SBI_CTRL_PMIC_ARBITER) { + ssbi->read = msm_ssbi_pa_read_bytes; + ssbi->write = msm_ssbi_pa_write_bytes; + } else { + ssbi->read = msm_ssbi_read_bytes; + ssbi->write = msm_ssbi_write_bytes; + } + + spin_lock_init(&ssbi->lock); + + ret = msm_ssbi_add_slave(ssbi, &pdata->slave); + if (ret) + goto err_ssbi_add_slave; + + return 0; + +err_ssbi_add_slave: + platform_set_drvdata(pdev, NULL); + iounmap(ssbi->base); +err_ioremap: +err_get_mem_res: + kfree(ssbi); + return ret; +} + +static int msm_ssbi_remove(struct platform_device *pdev) +{ + struct msm_ssbi *ssbi = platform_get_drvdata(pdev); + + platform_set_drvdata(pdev, NULL); + iounmap(ssbi->base); + kfree(ssbi); + return 0; +} + +static struct platform_driver msm_ssbi_driver = { + .probe = msm_ssbi_probe, + .remove = __exit_p(msm_ssbi_remove), + .driver = { + .name = "msm_ssbi", + .owner = THIS_MODULE, + }, +}; + +static int __init msm_ssbi_init(void) +{ + return platform_driver_register(&msm_ssbi_driver); +} +postcore_initcall(msm_ssbi_init); + +static void __exit msm_ssbi_exit(void) +{ + platform_driver_unregister(&msm_ssbi_driver); +} +module_exit(msm_ssbi_exit) + +MODULE_LICENSE("GPL v2"); +MODULE_VERSION("1.0"); +MODULE_ALIAS("platform:msm_ssbi"); +MODULE_AUTHOR("Dima Zavin "); diff --git a/include/linux/msm_ssbi.h b/include/linux/msm_ssbi.h new file mode 100644 index 000000000000..cfa47df6d003 --- /dev/null +++ b/include/linux/msm_ssbi.h @@ -0,0 +1,49 @@ +/* Copyright (C) 2010 Google, Inc. + * Copyright (c) 2011, Code Aurora Forum. All rights reserved. + * Author: Dima Zavin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _LINUX_MSM_SSBI_H +#define _LINUX_MSM_SSBI_H + +#include + +struct msm_ssbi_slave_info { + const char *name; + void *platform_data; +}; + +enum msm_ssbi_controller_type { + MSM_SBI_CTRL_SSBI = 0, + MSM_SBI_CTRL_SSBI2, + MSM_SBI_CTRL_PMIC_ARBITER, +}; + +struct msm_ssbi_platform_data { + struct msm_ssbi_slave_info slave; + enum msm_ssbi_controller_type controller_type; +}; + +#ifdef CONFIG_MSM_SSBI +int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len); +int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len); +#else +static inline int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len) +{ + return -ENXIO; +} +static inline int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len) +{ + return -ENXIO; +} +#endif +#endif -- cgit From 4a6692e2ac4c6b09235a9568468dd83a380c271d Mon Sep 17 00:00:00 2001 From: David Brown Date: Tue, 12 Mar 2013 11:41:49 -0700 Subject: ssbi: Allow compilation as a module The ssbi driver's read/write entry points are protected with wrappers in the case when the driver isn't enabled. These wrappers don't make any sense, since a client of the SSBI bus won't work without it. Make these just regular functions, so that the SSBI driver can be built as a module. 
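For context, the wrappers being removed follow the usual optional-dependency idiom, sketched below with a hypothetical CONFIG_FOO; the pattern only makes sense when callers can genuinely function with the subsystem compiled out, which is not the case for SSBI clients:

#include <linux/device.h>
#include <linux/errno.h>
#include <linux/types.h>

#ifdef CONFIG_FOO
int foo_read(struct device *dev, u16 addr, u8 *buf, int len);
#else
/* Stub: lets optional callers link with the subsystem disabled, but
 * hides misconfiguration from mandatory callers, which just see
 * -ENXIO at run time. */
static inline int foo_read(struct device *dev, u16 addr, u8 *buf, int len)
{
	return -ENXIO;
}
#endif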
Signed-off-by: David Brown Signed-off-by: Greg Kroah-Hartman --- drivers/ssbi/Kconfig | 2 +- include/linux/msm_ssbi.h | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) (limited to 'include/linux') diff --git a/drivers/ssbi/Kconfig b/drivers/ssbi/Kconfig index b57c41bd0119..c7bc534ddf50 100644 --- a/drivers/ssbi/Kconfig +++ b/drivers/ssbi/Kconfig @@ -5,7 +5,7 @@ menu "Qualcomm MSM SSBI bus support" config MSM_SSBI - bool "Qualcomm Single-wire Serial Bus Interface (SSBI)" + tristate "Qualcomm Single-wire Serial Bus Interface (SSBI)" help If you say yes to this option, support will be included for the built-in SSBI interface on Qualcomm MSM family processors. diff --git a/include/linux/msm_ssbi.h b/include/linux/msm_ssbi.h index cfa47df6d003..0fe245bb2940 100644 --- a/include/linux/msm_ssbi.h +++ b/include/linux/msm_ssbi.h @@ -33,17 +33,6 @@ struct msm_ssbi_platform_data { enum msm_ssbi_controller_type controller_type; }; -#ifdef CONFIG_MSM_SSBI int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len); int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len); -#else -static inline int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len) -{ - return -ENXIO; -} -static inline int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len) -{ - return -ENXIO; -} -#endif #endif -- cgit From ce44bf5b5544cbe6358abb01f039361a99b80901 Mon Sep 17 00:00:00 2001 From: David Brown Date: Tue, 12 Mar 2013 11:41:54 -0700 Subject: SSBI: Remove MSM_ prefix from SSBI drivers Although the SSBI bus is currently only used on MSM SoCs, it is still a bus in its own right. Remove the msm_ prefix from the driver and its symbols. Clients can now refer directly to ssbi_write() and ssbi_read(). Signed-off-by: David Brown Signed-off-by: Greg Kroah-Hartman --- drivers/Makefile | 2 +- drivers/mfd/Kconfig | 2 +- drivers/mfd/pm8921-core.c | 14 ++++---- drivers/ssbi/Kconfig | 4 +-- drivers/ssbi/Makefile | 2 +- drivers/ssbi/ssbi.c | 86 +++++++++++++++++++++++------------------------ include/linux/msm_ssbi.h | 38 --------------------- include/linux/ssbi.h | 38 +++++++++++++++++++++ 8 files changed, 93 insertions(+), 93 deletions(-) delete mode 100644 include/linux/msm_ssbi.h create mode 100644 include/linux/ssbi.h (limited to 'include/linux') diff --git a/drivers/Makefile b/drivers/Makefile index 778821ba3f68..4865ed24708a 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -114,7 +114,7 @@ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ obj-$(CONFIG_ARCH_SHMOBILE) += sh/ -obj-$(CONFIG_MSM_SSBI) += ssbi/ +obj-$(CONFIG_SSBI) += ssbi/ ifndef CONFIG_ARCH_USES_GETTIMEOFFSET obj-y += clocksource/ endif diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index 671f5b171c73..5bfa7bb555b7 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -990,7 +990,7 @@ config MFD_PM8XXX config MFD_PM8921_CORE tristate "Qualcomm PM8921 PMIC chip" - depends on MSM_SSBI + depends on SSBI select MFD_CORE select MFD_PM8XXX help diff --git a/drivers/mfd/pm8921-core.c b/drivers/mfd/pm8921-core.c index d4b297cbd801..ecc137ffa8c3 100644 --- a/drivers/mfd/pm8921-core.c +++ b/drivers/mfd/pm8921-core.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -35,7 +35,7 @@ static int pm8921_readb(const struct device *dev, u16 addr, u8 *val) const struct pm8xxx_drvdata *pm8921_drvdata = dev_get_drvdata(dev); const struct pm8921 *pmic = pm8921_drvdata->pm_chip_data; - return msm_ssbi_read(pmic->dev->parent, addr, val, 1); +
return ssbi_read(pmic->dev->parent, addr, val, 1); } static int pm8921_writeb(const struct device *dev, u16 addr, u8 val) @@ -43,7 +43,7 @@ static int pm8921_writeb(const struct device *dev, u16 addr, u8 val) const struct pm8xxx_drvdata *pm8921_drvdata = dev_get_drvdata(dev); const struct pm8921 *pmic = pm8921_drvdata->pm_chip_data; - return msm_ssbi_write(pmic->dev->parent, addr, &val, 1); + return ssbi_write(pmic->dev->parent, addr, &val, 1); } static int pm8921_read_buf(const struct device *dev, u16 addr, u8 *buf, @@ -52,7 +52,7 @@ static int pm8921_read_buf(const struct device *dev, u16 addr, u8 *buf, const struct pm8xxx_drvdata *pm8921_drvdata = dev_get_drvdata(dev); const struct pm8921 *pmic = pm8921_drvdata->pm_chip_data; - return msm_ssbi_read(pmic->dev->parent, addr, buf, cnt); + return ssbi_read(pmic->dev->parent, addr, buf, cnt); } static int pm8921_write_buf(const struct device *dev, u16 addr, u8 *buf, @@ -61,7 +61,7 @@ static int pm8921_write_buf(const struct device *dev, u16 addr, u8 *buf, const struct pm8xxx_drvdata *pm8921_drvdata = dev_get_drvdata(dev); const struct pm8921 *pmic = pm8921_drvdata->pm_chip_data; - return msm_ssbi_write(pmic->dev->parent, addr, buf, cnt); + return ssbi_write(pmic->dev->parent, addr, buf, cnt); } static int pm8921_read_irq_stat(const struct device *dev, int irq) @@ -124,7 +124,7 @@ static int pm8921_probe(struct platform_device *pdev) } /* Read PMIC chip revision */ - rc = msm_ssbi_read(pdev->dev.parent, REG_HWREV, &val, sizeof(val)); + rc = ssbi_read(pdev->dev.parent, REG_HWREV, &val, sizeof(val)); if (rc) { pr_err("Failed to read hw rev reg %d:rc=%d\n", REG_HWREV, rc); goto err_read_rev; @@ -133,7 +133,7 @@ static int pm8921_probe(struct platform_device *pdev) rev = val; /* Read PMIC chip revision 2 */ - rc = msm_ssbi_read(pdev->dev.parent, REG_HWREV_2, &val, sizeof(val)); + rc = ssbi_read(pdev->dev.parent, REG_HWREV_2, &val, sizeof(val)); if (rc) { pr_err("Failed to read hw rev 2 reg %d:rc=%d\n", REG_HWREV_2, rc); diff --git a/drivers/ssbi/Kconfig b/drivers/ssbi/Kconfig index c7bc534ddf50..1ae4040afedd 100644 --- a/drivers/ssbi/Kconfig +++ b/drivers/ssbi/Kconfig @@ -1,10 +1,10 @@ # -# MSM SSBI bus support +# SSBI bus support # menu "Qualcomm MSM SSBI bus support" -config MSM_SSBI +config SSBI tristate "Qualcomm Single-wire Serial Bus Interface (SSBI)" help If you say yes to this option, support will be included for the diff --git a/drivers/ssbi/Makefile b/drivers/ssbi/Makefile index 22e408f45d61..38fb70c31caf 100644 --- a/drivers/ssbi/Makefile +++ b/drivers/ssbi/Makefile @@ -1 +1 @@ -obj-$(CONFIG_MSM_SSBI) += ssbi.o +obj-$(CONFIG_SSBI) += ssbi.o diff --git a/drivers/ssbi/ssbi.c b/drivers/ssbi/ssbi.c index b056a072c3b3..f32da0258a8e 100644 --- a/drivers/ssbi/ssbi.c +++ b/drivers/ssbi/ssbi.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. +/* Copyright (c) 2009-2013, The Linux Foundation. All rights reserved. * Copyright (c) 2010, Google Inc. 
* * Original authors: Code Aurora Forum @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -65,23 +65,23 @@ #define SSBI_TIMEOUT_US 100 -struct msm_ssbi { +struct ssbi { struct device *slave; void __iomem *base; spinlock_t lock; - enum msm_ssbi_controller_type controller_type; - int (*read)(struct msm_ssbi *, u16 addr, u8 *buf, int len); - int (*write)(struct msm_ssbi *, u16 addr, u8 *buf, int len); + enum ssbi_controller_type controller_type; + int (*read)(struct ssbi *, u16 addr, u8 *buf, int len); + int (*write)(struct ssbi *, u16 addr, u8 *buf, int len); }; -#define to_msm_ssbi(dev) platform_get_drvdata(to_platform_device(dev)) +#define to_ssbi(dev) platform_get_drvdata(to_platform_device(dev)) -static inline u32 ssbi_readl(struct msm_ssbi *ssbi, u32 reg) +static inline u32 ssbi_readl(struct ssbi *ssbi, u32 reg) { return readl(ssbi->base + reg); } -static inline void ssbi_writel(struct msm_ssbi *ssbi, u32 val, u32 reg) +static inline void ssbi_writel(struct ssbi *ssbi, u32 val, u32 reg) { writel(val, ssbi->base + reg); } @@ -95,7 +95,7 @@ static inline void ssbi_writel(struct msm_ssbi *ssbi, u32 val, u32 reg) * * As such, this wait merely spins, with a udelay. */ -static int ssbi_wait_mask(struct msm_ssbi *ssbi, u32 set_mask, u32 clr_mask) +static int ssbi_wait_mask(struct ssbi *ssbi, u32 set_mask, u32 clr_mask) { u32 timeout = SSBI_TIMEOUT_US; u32 val; @@ -111,7 +111,7 @@ static int ssbi_wait_mask(struct msm_ssbi *ssbi, u32 set_mask, u32 clr_mask) } static int -msm_ssbi_read_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +ssbi_read_bytes(struct ssbi *ssbi, u16 addr, u8 *buf, int len) { u32 cmd = SSBI_CMD_RDWRN | ((addr & 0xff) << 16); int ret = 0; @@ -140,7 +140,7 @@ err: } static int -msm_ssbi_write_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +ssbi_write_bytes(struct ssbi *ssbi, u16 addr, u8 *buf, int len) { int ret = 0; @@ -172,7 +172,7 @@ err: * busywait. 
*/ static inline int -msm_ssbi_pa_transfer(struct msm_ssbi *ssbi, u32 cmd, u8 *data) +ssbi_pa_transfer(struct ssbi *ssbi, u32 cmd, u8 *data) { u32 timeout = SSBI_TIMEOUT_US; u32 rd_status = 0; @@ -197,7 +197,7 @@ msm_ssbi_pa_transfer(struct msm_ssbi *ssbi, u32 cmd, u8 *data) } static int -msm_ssbi_pa_read_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +ssbi_pa_read_bytes(struct ssbi *ssbi, u16 addr, u8 *buf, int len) { u32 cmd; int ret = 0; @@ -205,7 +205,7 @@ msm_ssbi_pa_read_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) cmd = SSBI_PA_CMD_RDWRN | (addr & SSBI_PA_CMD_ADDR_MASK) << 8; while (len) { - ret = msm_ssbi_pa_transfer(ssbi, cmd, buf); + ret = ssbi_pa_transfer(ssbi, cmd, buf); if (ret) goto err; buf++; @@ -217,14 +217,14 @@ err: } static int -msm_ssbi_pa_write_bytes(struct msm_ssbi *ssbi, u16 addr, u8 *buf, int len) +ssbi_pa_write_bytes(struct ssbi *ssbi, u16 addr, u8 *buf, int len) { u32 cmd; int ret = 0; while (len) { cmd = (addr & SSBI_PA_CMD_ADDR_MASK) << 8 | *buf; - ret = msm_ssbi_pa_transfer(ssbi, cmd, NULL); + ret = ssbi_pa_transfer(ssbi, cmd, NULL); if (ret) goto err; buf++; @@ -235,9 +235,9 @@ err: return ret; } -int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len) +int ssbi_read(struct device *dev, u16 addr, u8 *buf, int len) { - struct msm_ssbi *ssbi = to_msm_ssbi(dev); + struct ssbi *ssbi = to_ssbi(dev); unsigned long flags; int ret; @@ -247,11 +247,11 @@ int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len) return ret; } -EXPORT_SYMBOL_GPL(msm_ssbi_read); +EXPORT_SYMBOL_GPL(ssbi_read); -int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len) +int ssbi_write(struct device *dev, u16 addr, u8 *buf, int len) { - struct msm_ssbi *ssbi = to_msm_ssbi(dev); + struct ssbi *ssbi = to_ssbi(dev); unsigned long flags; int ret; @@ -261,17 +261,17 @@ int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len) return ret; } -EXPORT_SYMBOL_GPL(msm_ssbi_write); +EXPORT_SYMBOL_GPL(ssbi_write); -static int msm_ssbi_probe(struct platform_device *pdev) +static int ssbi_probe(struct platform_device *pdev) { struct device_node *np = pdev->dev.of_node; struct resource *mem_res; - struct msm_ssbi *ssbi; + struct ssbi *ssbi; int ret = 0; const char *type; - ssbi = kzalloc(sizeof(struct msm_ssbi), GFP_KERNEL); + ssbi = kzalloc(sizeof(struct ssbi), GFP_KERNEL); if (!ssbi) { pr_err("can not allocate ssbi_data\n"); return -ENOMEM; @@ -312,11 +312,11 @@ static int msm_ssbi_probe(struct platform_device *pdev) } if (ssbi->controller_type == MSM_SBI_CTRL_PMIC_ARBITER) { - ssbi->read = msm_ssbi_pa_read_bytes; - ssbi->write = msm_ssbi_pa_write_bytes; + ssbi->read = ssbi_pa_read_bytes; + ssbi->write = ssbi_pa_write_bytes; } else { - ssbi->read = msm_ssbi_read_bytes; - ssbi->write = msm_ssbi_write_bytes; + ssbi->read = ssbi_read_bytes; + ssbi->write = ssbi_write_bytes; } spin_lock_init(&ssbi->lock); @@ -336,9 +336,9 @@ err_get_mem_res: return ret; } -static int msm_ssbi_remove(struct platform_device *pdev) +static int ssbi_remove(struct platform_device *pdev) { - struct msm_ssbi *ssbi = platform_get_drvdata(pdev); + struct ssbi *ssbi = platform_get_drvdata(pdev); platform_set_drvdata(pdev, NULL); iounmap(ssbi->base); @@ -351,29 +351,29 @@ static struct of_device_id ssbi_match_table[] = { {} }; -static struct platform_driver msm_ssbi_driver = { - .probe = msm_ssbi_probe, - .remove = msm_ssbi_remove, +static struct platform_driver ssbi_driver = { + .probe = ssbi_probe, + .remove = ssbi_remove, .driver = { - .name = "msm_ssbi", + .name = "ssbi", 
.owner = THIS_MODULE, .of_match_table = ssbi_match_table, }, }; -static int __init msm_ssbi_init(void) +static int __init ssbi_init(void) { - return platform_driver_register(&msm_ssbi_driver); + return platform_driver_register(&ssbi_driver); } -module_init(msm_ssbi_init); +module_init(ssbi_init); -static void __exit msm_ssbi_exit(void) +static void __exit ssbi_exit(void) { - platform_driver_unregister(&msm_ssbi_driver); + platform_driver_unregister(&ssbi_driver); } -module_exit(msm_ssbi_exit) +module_exit(ssbi_exit) MODULE_LICENSE("GPL v2"); MODULE_VERSION("1.0"); -MODULE_ALIAS("platform:msm_ssbi"); +MODULE_ALIAS("platform:ssbi"); MODULE_AUTHOR("Dima Zavin "); diff --git a/include/linux/msm_ssbi.h b/include/linux/msm_ssbi.h deleted file mode 100644 index 0fe245bb2940..000000000000 --- a/include/linux/msm_ssbi.h +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (C) 2010 Google, Inc. - * Copyright (c) 2011, Code Aurora Forum. All rights reserved. - * Author: Dima Zavin - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 and - * only version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef _LINUX_MSM_SSBI_H -#define _LINUX_MSM_SSBI_H - -#include - -struct msm_ssbi_slave_info { - const char *name; - void *platform_data; -}; - -enum msm_ssbi_controller_type { - MSM_SBI_CTRL_SSBI = 0, - MSM_SBI_CTRL_SSBI2, - MSM_SBI_CTRL_PMIC_ARBITER, -}; - -struct msm_ssbi_platform_data { - struct msm_ssbi_slave_info slave; - enum msm_ssbi_controller_type controller_type; -}; - -int msm_ssbi_write(struct device *dev, u16 addr, u8 *buf, int len); -int msm_ssbi_read(struct device *dev, u16 addr, u8 *buf, int len); -#endif diff --git a/include/linux/ssbi.h b/include/linux/ssbi.h new file mode 100644 index 000000000000..44ef5da21470 --- /dev/null +++ b/include/linux/ssbi.h @@ -0,0 +1,38 @@ +/* Copyright (C) 2010 Google, Inc. + * Copyright (c) 2011, Code Aurora Forum. All rights reserved. + * Author: Dima Zavin + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef _LINUX_SSBI_H +#define _LINUX_SSBI_H + +#include + +struct ssbi_slave_info { + const char *name; + void *platform_data; +}; + +enum ssbi_controller_type { + MSM_SBI_CTRL_SSBI = 0, + MSM_SBI_CTRL_SSBI2, + MSM_SBI_CTRL_PMIC_ARBITER, +}; + +struct ssbi_platform_data { + struct ssbi_slave_info slave; + enum ssbi_controller_type controller_type; +}; + +int ssbi_write(struct device *dev, u16 addr, u8 *buf, int len); +int ssbi_read(struct device *dev, u16 addr, u8 *buf, int len); +#endif -- cgit From 303f0847925ece27129487a2bfc05199ab2a0b51 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 15 Mar 2013 12:08:53 +0800 Subject: USB: adds comment on suspend callback This patch adds comments on interface driver suspend callback to emphasize that the failure return value is ignored by USB core in system sleep context, so do not try to recover device for this case and let resume/reset_resume callback handle the suspend failure if needed. Also kerneldoc for usb_suspend_both() is updated with the fact. Acked-by: Alan Stern Signed-off-by: Ming Lei Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/driver.c | 11 ++++++++--- include/linux/usb.h | 7 ++++++- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index d938b2b99e31..eb1d00a3543a 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -1196,9 +1196,14 @@ done: * * This is the central routine for suspending USB devices. It calls the * suspend methods for all the interface drivers in @udev and then calls - * the suspend method for @udev itself. If an error occurs at any stage, - * all the interfaces which were suspended are resumed so that they remain - * in the same state as the device. + * the suspend method for @udev itself. When the routine is called in + * autosuspend, if an error occurs at any stage, all the interfaces + * which were suspended are resumed so that they remain in the same + * state as the device, but when called from system sleep, all error + * from suspend methods of interfaces and the non-root-hub device itself + * are simply ignored, so all suspended interfaces are only resumed + * to the device's state when @udev is root-hub and its suspend method + * returns failure. * * Autosuspend requests originating from a child device or an interface * driver may be made without the protection of @udev's device lock, but diff --git a/include/linux/usb.h b/include/linux/usb.h index 52464fb2389b..8d4bc173d66a 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -976,7 +976,12 @@ struct usbdrv_wrap { * the "usbfs" filesystem. This lets devices provide ways to * expose information to user space regardless of where they * do (or don't) show up otherwise in the filesystem. - * @suspend: Called when the device is going to be suspended by the system. + * @suspend: Called when the device is going to be suspended by the + * system either from system sleep or runtime suspend context. The + * return value will be ignored in system sleep context, so do NOT + * try to continue using the device if suspend fails in this case. + * Instead, let the resume or reset-resume routine recover from + * the failure. * @resume: Called when the device is being resumed by the system. * @reset_resume: Called when the suspended device has been reset instead * of being resumed. 
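A minimal sketch of an interface driver following this contract (the example_dev type, its fields and the EXAMPLE names are hypothetical, not taken from this patch):

static int example_suspend(struct usb_interface *intf, pm_message_t message)
{
	struct example_dev *dev = usb_get_intfdata(intf);	/* hypothetical private data */

	/* Quiesce I/O unconditionally. */
	usb_kill_urb(dev->int_urb);

	/*
	 * A nonzero return aborts runtime suspend only. In system sleep
	 * the core ignores this value, so do not try to recover the
	 * device here; leave that to resume() or reset_resume().
	 */
	if (PMSG_IS_AUTO(message) && dev->tx_in_flight)
		return -EBUSY;

	return 0;
}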
-- cgit From f91a595d0b813c3a2f3b848c165e865ca4c5b8cc Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Sat, 16 Mar 2013 11:46:44 +0530 Subject: memory: emif: Handle devices which are not rated for >85C As per the JESD209-2E specification for LPDDR2, http://www.jedec.org/standards-documents/results/jesd209-2E Table 73, LPDDR2 memories come in two flavors - Standard and Extended. The Standard types can operate from -25C to +85C. However, anything beyond that, up to +105C, can only be supported by the Extended types. Unfortunately, it seems there is no info in MR0 (device info) or MR[1,2] (device feature) for run-time detection of this capability, as far as can be seen in the spec. Hence, we provide a custom_config flag to be populated by platforms which have these "extended" type memories. For the "Standard" memories, we need to consider MR4 notifications of temperature triggers >85C as equivalent to thermal shutdown events (equivalent to the spec-specified thermal shutdown events for "extended" parts). Reported-by: Richard Woodruff Signed-off-by: Nishanth Menon Signed-off-by: Lokesh Vutla Acked-by: Santosh Shilimkar Signed-off-by: Greg Kroah-Hartman --- drivers/memory/emif.c | 27 +++++++++++++++++++++++++++ include/linux/platform_data/emif_plat.h | 1 + 2 files changed, 28 insertions(+) (limited to 'include/linux') diff --git a/drivers/memory/emif.c b/drivers/memory/emif.c index 249222905c94..96add5b9ce5d 100644 --- a/drivers/memory/emif.c +++ b/drivers/memory/emif.c @@ -918,6 +918,7 @@ static irqreturn_t handle_temp_alert(void __iomem *base, struct emif_data *emif) { u32 old_temp_level; irqreturn_t ret = IRQ_HANDLED; + struct emif_custom_configs *custom_configs; spin_lock_irqsave(&emif_lock, irq_state); old_temp_level = emif->temperature_level; @@ -930,6 +931,29 @@ static irqreturn_t handle_temp_alert(void __iomem *base, struct emif_data *emif) goto out; } + custom_configs = emif->plat_data->custom_configs; + + /* + * IF we detect higher than "nominal rating" from DDR sensor + * on an unsupported DDR part, shutdown system + */ + if (custom_configs && !(custom_configs->mask & + EMIF_CUSTOM_CONFIG_EXTENDED_TEMP_PART)) { + if (emif->temperature_level >= SDRAM_TEMP_HIGH_DERATE_REFRESH) { + dev_err(emif->dev, + "%s:NOT Extended temperature capable memory."
+ "Converting MR4=0x%02x as shutdown event\n", + __func__, emif->temperature_level); + /* + * Temperature far too high - do kernel_power_off() + * from thread context + */ + emif->temperature_level = SDRAM_TEMP_VERY_HIGH_SHUTDOWN; + ret = IRQ_WAKE_THREAD; + goto out; + } + } + if (emif->temperature_level < old_temp_level || emif->temperature_level == SDRAM_TEMP_VERY_HIGH_SHUTDOWN) { /* @@ -1228,6 +1252,9 @@ static void __init_or_module of_get_custom_configs(struct device_node *np_emif, cust_cfgs->temp_alert_poll_interval_ms = *poll_intvl; } + if (of_find_property(np_emif, "extended-temp-part", &len)) + cust_cfgs->mask |= EMIF_CUSTOM_CONFIG_EXTENDED_TEMP_PART; + if (!is_custom_config_valid(cust_cfgs, emif->dev)) { devm_kfree(emif->dev, cust_cfgs); return; diff --git a/include/linux/platform_data/emif_plat.h b/include/linux/platform_data/emif_plat.h index 03378ca84061..5c19a2a647c4 100644 --- a/include/linux/platform_data/emif_plat.h +++ b/include/linux/platform_data/emif_plat.h @@ -40,6 +40,7 @@ /* Custom config requests */ #define EMIF_CUSTOM_CONFIG_LPMODE 0x00000001 #define EMIF_CUSTOM_CONFIG_TEMP_ALERT_POLL_INTERVAL 0x00000002 +#define EMIF_CUSTOM_CONFIG_EXTENDED_TEMP_PART 0x00000004 #ifndef __ASSEMBLY__ /** -- cgit From 3edce1cf813aa6a087df7730cec0e67d57288300 Mon Sep 17 00:00:00 2001 From: Bjørn Mork Date: Sun, 17 Mar 2013 21:00:06 +0100 Subject: USB: cdc-wdm: implement IOCTL_WDM_MAX_COMMAND MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Userspace applications need to know the maximum supported message size. The cdc-wdm driver translates between a character device stream and a message-based protocol. Each message is transported as a usb control message with no further encapsulation or synchronization. Each read or write on the character device should translate to exactly one usb control message to ensure that message boundaries are kept intact. That means that the userspace application must know the maximum message size supported by the device and driver, making this size a vital part of the cdc-wdm character device API. CDC WDM and CDC MBIM functions export the maximum supported message size through CDC functional descriptors. The cdc-wdm and cdc_mbim drivers will parse these descriptors and use the value chosen by the device. The only current way for a userspace application to retrieve the value is by duplicating the descriptor parsing. This is an unnecessarily complex task, and application writers are likely to postpone it, using a fixed value and adding a "todo" item. QMI functions have no way to tell the host what message size they support. The qmi_wwan driver uses a fixed value based on protocol recommendations and observed device behaviour. Userspace applications must know and hard-code the same value. This scheme will break if we ever encounter a QMI device needing a device-specific message size quirk. We are currently unable to support such a device because using a non-default size would break the implicit userspace API. The message size is currently a hidden attribute of the cdc-wdm userspace API. Retrieving it is unnecessarily complex, increasing the possibility of drivers and applications using different limits. The resulting errors are hard to debug, and can only be replicated on identical hardware. Exporting the maximum message size from the driver simplifies the task for the userspace application, and creates a unified information source independent of device and function class.
It also serves to document that the message size is part of the cdc-wdm userspace API. This proposed API extension has been presented to the authors of userspace applications and libraries using the current API: libmbim, libqmi, uqmi, oFono and ModemManager. The replies were: Aleksander Morgado: "We do really need max message size for MBIM; and as you say, it may be good to have the max message size info also for QMI, so the new ioctl seems a good addition. So +1 from my side, for what it's worth." Dan Williams: "Yeah, +1 here. I'd prefer the sysfs file, but the fact that that doesn't work for fd passing pretty much kills it." No negative replies have been received so far. Cc: Aleksander Morgado Cc: Dan Williams Signed-off-by: Bjørn Mork Acked-by: Oliver Neukum Signed-off-by: Greg Kroah-Hartman --- Documentation/ioctl/ioctl-number.txt | 1 + drivers/usb/class/cdc-wdm.c | 19 +++++++++++++++++++ include/linux/usb/cdc-wdm.h | 2 ++ include/uapi/linux/usb/cdc-wdm.h | 21 +++++++++++++++++++++ 4 files changed, 43 insertions(+) create mode 100644 include/uapi/linux/usb/cdc-wdm.h (limited to 'include/linux') diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 3210540f8bd3..237acab169dd 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -131,6 +131,7 @@ Code Seq#(hex) Include File Comments 'H' 40-4F sound/hdspm.h conflict! 'H' 40-4F sound/hdsp.h conflict! 'H' 90 sound/usb/usx2y/usb_stream.h +'H' A0 uapi/linux/usb/cdc-wdm.h 'H' C0-F0 net/bluetooth/hci.h conflict! 'H' C0-DF net/bluetooth/hidp/hidp.h conflict! 'H' C0-DF net/bluetooth/cmtp/cmtp.h conflict! diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 122d056d96d5..8a230f0ef77c 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -13,6 +13,7 @@ */ #include #include +#include #include #include #include @@ -644,6 +645,22 @@ static int wdm_release(struct inode *inode, struct file *file) return 0; } +static long wdm_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct wdm_device *desc = file->private_data; + int rv = 0; + + switch (cmd) { + case IOCTL_WDM_MAX_COMMAND: + if (copy_to_user((void __user *)arg, &desc->wMaxCommand, sizeof(desc->wMaxCommand))) + rv = -EFAULT; + break; + default: + rv = -ENOTTY; + } + return rv; +} + static const struct file_operations wdm_fops = { .owner = THIS_MODULE, .read = wdm_read, @@ -652,6 +669,8 @@ .flush = wdm_flush, .release = wdm_release, .poll = wdm_poll, + .unlocked_ioctl = wdm_ioctl, + .compat_ioctl = wdm_ioctl, .llseek = noop_llseek, }; diff --git a/include/linux/usb/cdc-wdm.h b/include/linux/usb/cdc-wdm.h index 719c332620fa..0b3f4295c025 100644 --- a/include/linux/usb/cdc-wdm.h +++ b/include/linux/usb/cdc-wdm.h @@ -11,6 +11,8 @@ #ifndef __LINUX_USB_CDC_WDM_H #define __LINUX_USB_CDC_WDM_H +#include + extern struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, int bufsize, diff --git a/include/uapi/linux/usb/cdc-wdm.h b/include/uapi/linux/usb/cdc-wdm.h new file mode 100644 index 000000000000..f03134feebd6 --- /dev/null +++ b/include/uapi/linux/usb/cdc-wdm.h @@ -0,0 +1,21 @@ +/* + * USB CDC Device Management userspace API definitions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation.
+ */ + +#ifndef _UAPI__LINUX_USB_CDC_WDM_H +#define _UAPI__LINUX_USB_CDC_WDM_H + +/* + * This IOCTL is used to retrieve the wMaxCommand for the device, + * defining the message limit for both reading and writing. + * + * For CDC WDM functions this will be the wMaxCommand field of the + * Device Management Functional Descriptor. + */ +#define IOCTL_WDM_MAX_COMMAND _IOR('H', 0xA0, __u16) + +#endif /* _UAPI__LINUX_USB_CDC_WDM_H */ -- cgit From c5116e9d8d2de324f13a91fe5afc308cd6b0ca93 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Tue, 19 Mar 2013 16:58:58 +0100 Subject: ssb: define more board types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rafał Miłecki Signed-off-by: John W. Linville --- include/linux/ssb/ssb.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb.h b/include/linux/ssb/ssb.h index 8b1322296fed..c64999fd1660 100644 --- a/include/linux/ssb/ssb.h +++ b/include/linux/ssb/ssb.h @@ -340,13 +340,61 @@ enum ssb_bustype { #define SSB_BOARDVENDOR_DELL 0x1028 /* Dell */ #define SSB_BOARDVENDOR_HP 0x0E11 /* HP */ /* board_type */ +#define SSB_BOARD_BCM94301CB 0x0406 +#define SSB_BOARD_BCM94301MP 0x0407 +#define SSB_BOARD_BU4309 0x040A +#define SSB_BOARD_BCM94309CB 0x040B +#define SSB_BOARD_BCM4309MP 0x040C +#define SSB_BOARD_BU4306 0x0416 #define SSB_BOARD_BCM94306MP 0x0418 #define SSB_BOARD_BCM4309G 0x0421 #define SSB_BOARD_BCM4306CB 0x0417 -#define SSB_BOARD_BCM4309MP 0x040C +#define SSB_BOARD_BCM94306PC 0x0425 /* pcmcia 3.3v 4306 card */ +#define SSB_BOARD_BCM94306CBSG 0x042B /* with SiGe PA */ +#define SSB_BOARD_PCSG94306 0x042D /* with SiGe PA */ +#define SSB_BOARD_BU4704SD 0x042E /* with sdram */ +#define SSB_BOARD_BCM94704AGR 0x042F /* dual 11a/11g Router */ +#define SSB_BOARD_BCM94308MP 0x0430 /* 11a-only minipci */ +#define SSB_BOARD_BU4318 0x0447 +#define SSB_BOARD_CB4318 0x0448 +#define SSB_BOARD_MPG4318 0x0449 #define SSB_BOARD_MP4318 0x044A -#define SSB_BOARD_BU4306 0x0416 -#define SSB_BOARD_BU4309 0x040A +#define SSB_BOARD_SD4318 0x044B +#define SSB_BOARD_BCM94306P 0x044C /* with SiGe */ +#define SSB_BOARD_BCM94303MP 0x044E +#define SSB_BOARD_BCM94306MPM 0x0450 +#define SSB_BOARD_BCM94306MPL 0x0453 +#define SSB_BOARD_PC4303 0x0454 /* pcmcia */ +#define SSB_BOARD_BCM94306MPLNA 0x0457 +#define SSB_BOARD_BCM94306MPH 0x045B +#define SSB_BOARD_BCM94306PCIV 0x045C +#define SSB_BOARD_BCM94318MPGH 0x0463 +#define SSB_BOARD_BU4311 0x0464 +#define SSB_BOARD_BCM94311MC 0x0465 +#define SSB_BOARD_BCM94311MCAG 0x0466 +/* 4321 boards */ +#define SSB_BOARD_BU4321 0x046B +#define SSB_BOARD_BU4321E 0x047C +#define SSB_BOARD_MP4321 0x046C +#define SSB_BOARD_CB2_4321 0x046D +#define SSB_BOARD_CB2_4321_AG 0x0066 +#define SSB_BOARD_MC4321 0x046E +/* 4325 boards */ +#define SSB_BOARD_BCM94325DEVBU 0x0490 +#define SSB_BOARD_BCM94325BGABU 0x0491 +#define SSB_BOARD_BCM94325SDGWB 0x0492 +#define SSB_BOARD_BCM94325SDGMDL 0x04AA +#define SSB_BOARD_BCM94325SDGMDL2 0x04C6 +#define SSB_BOARD_BCM94325SDGMDL3 0x04C9 +#define SSB_BOARD_BCM94325SDABGWBA 0x04E1 +/* 4322 boards */ +#define SSB_BOARD_BCM94322MC 0x04A4 +#define SSB_BOARD_BCM94322USB 0x04A8 /* dualband */ +#define SSB_BOARD_BCM94322HM 0x04B0 +#define SSB_BOARD_BCM94322USB2D 0x04Bf /* single band discrete front end */ +/* 4312 boards */ +#define SSB_BOARD_BU4312 0x048A +#define SSB_BOARD_BCM4312MCGSG 0x04B5 /* chip_package */ #define SSB_CHIPPACK_BCM4712S 1 /* Small 
200pin 4712 */ #define SSB_CHIPPACK_BCM4712M 2 /* Medium 225pin 4712 */ -- cgit From 3e6998574fde0ab7a3329c9229394dd80462ead2 Mon Sep 17 00:00:00 2001 From: Rafał Miłecki Date: Tue, 19 Mar 2013 16:58:59 +0100 Subject: bcma: define board types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using that IDs we can write workarounds for various cards Signed-off-by: Rafał Miłecki Signed-off-by: John W. Linville --- include/linux/bcma/bcma.h | 54 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bcma/bcma.h b/include/linux/bcma/bcma.h index e0ce311011c0..0ab6712fd76b 100644 --- a/include/linux/bcma/bcma.h +++ b/include/linux/bcma/bcma.h @@ -173,6 +173,60 @@ struct bcma_host_ops { #define BCMA_CHIP_ID_BCM53572 53572 #define BCMA_PKG_ID_BCM47188 9 +/* Board types (on PCI usually equals to the subsystem dev id) */ +/* BCM4313 */ +#define BCMA_BOARD_TYPE_BCM94313BU 0X050F +#define BCMA_BOARD_TYPE_BCM94313HM 0X0510 +#define BCMA_BOARD_TYPE_BCM94313EPA 0X0511 +#define BCMA_BOARD_TYPE_BCM94313HMG 0X051C +/* BCM4716 */ +#define BCMA_BOARD_TYPE_BCM94716NR2 0X04CD +/* BCM43224 */ +#define BCMA_BOARD_TYPE_BCM943224X21 0X056E +#define BCMA_BOARD_TYPE_BCM943224X21_FCC 0X00D1 +#define BCMA_BOARD_TYPE_BCM943224X21B 0X00E9 +#define BCMA_BOARD_TYPE_BCM943224M93 0X008B +#define BCMA_BOARD_TYPE_BCM943224M93A 0X0090 +#define BCMA_BOARD_TYPE_BCM943224X16 0X0093 +#define BCMA_BOARD_TYPE_BCM94322X9 0X008D +#define BCMA_BOARD_TYPE_BCM94322M35E 0X008E +/* BCM43228 */ +#define BCMA_BOARD_TYPE_BCM943228BU8 0X0540 +#define BCMA_BOARD_TYPE_BCM943228BU9 0X0541 +#define BCMA_BOARD_TYPE_BCM943228BU 0X0542 +#define BCMA_BOARD_TYPE_BCM943227HM4L 0X0543 +#define BCMA_BOARD_TYPE_BCM943227HMB 0X0544 +#define BCMA_BOARD_TYPE_BCM943228HM4L 0X0545 +#define BCMA_BOARD_TYPE_BCM943228SD 0X0573 +/* BCM4331 */ +#define BCMA_BOARD_TYPE_BCM94331X19 0X00D6 +#define BCMA_BOARD_TYPE_BCM94331X28 0X00E4 +#define BCMA_BOARD_TYPE_BCM94331X28B 0X010E +#define BCMA_BOARD_TYPE_BCM94331PCIEBT3AX 0X00E4 +#define BCMA_BOARD_TYPE_BCM94331X12_2G 0X00EC +#define BCMA_BOARD_TYPE_BCM94331X12_5G 0X00ED +#define BCMA_BOARD_TYPE_BCM94331X29B 0X00EF +#define BCMA_BOARD_TYPE_BCM94331CSAX 0X00EF +#define BCMA_BOARD_TYPE_BCM94331X19C 0X00F5 +#define BCMA_BOARD_TYPE_BCM94331X33 0X00F4 +#define BCMA_BOARD_TYPE_BCM94331BU 0X0523 +#define BCMA_BOARD_TYPE_BCM94331S9BU 0X0524 +#define BCMA_BOARD_TYPE_BCM94331MC 0X0525 +#define BCMA_BOARD_TYPE_BCM94331MCI 0X0526 +#define BCMA_BOARD_TYPE_BCM94331PCIEBT4 0X0527 +#define BCMA_BOARD_TYPE_BCM94331HM 0X0574 +#define BCMA_BOARD_TYPE_BCM94331PCIEDUAL 0X059B +#define BCMA_BOARD_TYPE_BCM94331MCH5 0X05A9 +#define BCMA_BOARD_TYPE_BCM94331CS 0X05C6 +#define BCMA_BOARD_TYPE_BCM94331CD 0X05DA +/* BCM53572 */ +#define BCMA_BOARD_TYPE_BCM953572BU 0X058D +#define BCMA_BOARD_TYPE_BCM953572NR2 0X058E +#define BCMA_BOARD_TYPE_BCM947188NR2 0X058F +#define BCMA_BOARD_TYPE_BCM953572SDRNR2 0X0590 +/* BCM43142 */ +#define BCMA_BOARD_TYPE_BCM943142HM 0X05E0 + struct bcma_device { struct bcma_bus *bus; struct bcma_device_id id; -- cgit From 12bef78f0a806639daef58b1770be6ea19b2e94d Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Thu, 21 Mar 2013 16:26:19 +0100 Subject: ssb: fix sprom constant for ant_available_{bg,a} This was done accordingly to new specs. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. 
Linville --- include/linux/ssb/ssb_regs.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ssb/ssb_regs.h b/include/linux/ssb/ssb_regs.h index 6ecfa02ddbac..3a7256955b10 100644 --- a/include/linux/ssb/ssb_regs.h +++ b/include/linux/ssb/ssb_regs.h @@ -289,11 +289,11 @@ #define SSB_SPROM4_ETHPHY_ET1A_SHIFT 5 #define SSB_SPROM4_ETHPHY_ET0M (1<<14) /* MDIO for enet0 */ #define SSB_SPROM4_ETHPHY_ET1M (1<<15) /* MDIO for enet1 */ -#define SSB_SPROM4_ANTAVAIL 0x005D /* Antenna available bitfields */ -#define SSB_SPROM4_ANTAVAIL_A 0x00FF /* A-PHY bitfield */ -#define SSB_SPROM4_ANTAVAIL_A_SHIFT 0 -#define SSB_SPROM4_ANTAVAIL_BG 0xFF00 /* B-PHY and G-PHY bitfield */ -#define SSB_SPROM4_ANTAVAIL_BG_SHIFT 8 +#define SSB_SPROM4_ANTAVAIL 0x005C /* Antenna available bitfields */ +#define SSB_SPROM4_ANTAVAIL_BG 0x00FF /* B-PHY and G-PHY bitfield */ +#define SSB_SPROM4_ANTAVAIL_BG_SHIFT 0 +#define SSB_SPROM4_ANTAVAIL_A 0xFF00 /* A-PHY bitfield */ +#define SSB_SPROM4_ANTAVAIL_A_SHIFT 8 #define SSB_SPROM4_AGAIN01 0x005E /* Antenna Gain (in dBm Q5.2) */ #define SSB_SPROM4_AGAIN0 0x00FF /* Antenna 0 */ #define SSB_SPROM4_AGAIN0_SHIFT 0 -- cgit From 0f16cfe39eeef47c91aa3c3bf2b49954d5313a58 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Mar 2013 12:36:42 +0100 Subject: USB: serial: remove generic disconnect callback Remove the now empty generic disconnect callback and make the disconnect callback non-mandatory. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/generic.c | 6 ------ drivers/usb/serial/usb-serial.c | 4 ++-- include/linux/usb/serial.h | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index a6d0ac638e0a..4d421f3f8a7c 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -45,7 +45,6 @@ struct usb_serial_driver usb_serial_generic_device = { }, .id_table = generic_device_ids, .num_ports = 1, - .disconnect = usb_serial_generic_disconnect, .release = usb_serial_generic_release, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, @@ -500,11 +499,6 @@ int usb_serial_generic_resume(struct usb_serial *serial) } EXPORT_SYMBOL_GPL(usb_serial_generic_resume); -void usb_serial_generic_disconnect(struct usb_serial *serial) -{ -} -EXPORT_SYMBOL_GPL(usb_serial_generic_disconnect); - void usb_serial_generic_release(struct usb_serial *serial) { } diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index e7f97b58e914..569b6792c218 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -1095,7 +1095,8 @@ static void usb_serial_disconnect(struct usb_interface *interface) device_del(&port->dev); } } - serial->type->disconnect(serial); + if (serial->type->disconnect) + serial->type->disconnect(serial); /* let the last holder of this object cause it to be cleaned up */ usb_serial_put(serial); @@ -1304,7 +1305,6 @@ static void fixup_generic(struct usb_serial_driver *device) set_to_generic_if_null(device, chars_in_buffer); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); - set_to_generic_if_null(device, disconnect); set_to_generic_if_null(device, release); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 
1819b59aab2a..437dfd6787f9 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -329,7 +329,6 @@ extern void usb_serial_generic_read_bulk_callback(struct urb *urb); extern void usb_serial_generic_write_bulk_callback(struct urb *urb); extern void usb_serial_generic_throttle(struct tty_struct *tty); extern void usb_serial_generic_unthrottle(struct tty_struct *tty); -extern void usb_serial_generic_disconnect(struct usb_serial *serial); extern void usb_serial_generic_release(struct usb_serial *serial); extern int usb_serial_generic_register(void); extern void usb_serial_generic_deregister(void); -- cgit From 79b80b8a1141ba0605e917a6fc12d44383ab29b8 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Mar 2013 12:36:43 +0100 Subject: USB: serial: remove generic release callback Remove empty generic release implementation and make the release callback non-mandatory (like attach, probe and disconnect). Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/generic.c | 5 ----- drivers/usb/serial/usb-serial.c | 3 +-- include/linux/usb/serial.h | 1 - 3 files changed, 1 insertion(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 4d421f3f8a7c..aa71f6e72f61 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -45,7 +45,6 @@ struct usb_serial_driver usb_serial_generic_device = { }, .id_table = generic_device_ids, .num_ports = 1, - .release = usb_serial_generic_release, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .resume = usb_serial_generic_resume, @@ -498,7 +497,3 @@ int usb_serial_generic_resume(struct usb_serial *serial) return c ? -EIO : 0; } EXPORT_SYMBOL_GPL(usb_serial_generic_resume); - -void usb_serial_generic_release(struct usb_serial *serial) -{ -} diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 569b6792c218..4819fd9a639a 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -137,7 +137,7 @@ static void destroy_serial(struct kref *kref) if (serial->minor != SERIAL_TTY_NO_MINOR) return_serial(serial); - if (serial->attached) + if (serial->attached && serial->type->release) serial->type->release(serial); /* Now that nothing is using the ports, they can be freed */ @@ -1305,7 +1305,6 @@ static void fixup_generic(struct usb_serial_driver *device) set_to_generic_if_null(device, chars_in_buffer); set_to_generic_if_null(device, read_bulk_callback); set_to_generic_if_null(device, write_bulk_callback); - set_to_generic_if_null(device, release); set_to_generic_if_null(device, process_read_urb); set_to_generic_if_null(device, prepare_write_buffer); } diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 437dfd6787f9..3f8f5e3c76d5 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -329,7 +329,6 @@ extern void usb_serial_generic_read_bulk_callback(struct urb *urb); extern void usb_serial_generic_write_bulk_callback(struct urb *urb); extern void usb_serial_generic_throttle(struct tty_struct *tty); extern void usb_serial_generic_unthrottle(struct tty_struct *tty); -extern void usb_serial_generic_release(struct usb_serial *serial); extern int usb_serial_generic_register(void); extern void usb_serial_generic_deregister(void); extern int usb_serial_generic_submit_read_urbs(struct usb_serial_port *port, -- cgit From 143d9d961608b737d90a813deaaf91affb41c83c Mon Sep 17 00:00:00 2001 From: Johan Hovold 
Date: Thu, 21 Mar 2013 12:36:51 +0100 Subject: USB: serial: add tiocmiwait subdriver operation Add tiocmiwait operation to struct usb_serial_driver. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/usb-serial.c | 15 +++++++++++---- include/linux/usb/serial.h | 1 + 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 0b39d013c505..ada400d6594b 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -402,10 +402,17 @@ static int serial_ioctl(struct tty_struct *tty, dev_dbg(tty->dev, "%s - cmd 0x%.4x\n", __func__, cmd); - if (port->serial->type->ioctl) - retval = port->serial->type->ioctl(tty, cmd, arg); - else - retval = -ENOIOCTLCMD; + switch (cmd) { + case TIOCMIWAIT: + if (port->serial->type->tiocmiwait) + retval = port->serial->type->tiocmiwait(tty, arg); + break; + default: + if (port->serial->type->ioctl) + retval = port->serial->type->ioctl(tty, cmd, arg); + else + retval = -ENOIOCTLCMD; + } return retval; } diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 3f8f5e3c76d5..9c8b53f80f48 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -272,6 +272,7 @@ struct usb_serial_driver { int (*tiocmget)(struct tty_struct *tty); int (*tiocmset)(struct tty_struct *tty, unsigned int set, unsigned int clear); + int (*tiocmiwait)(struct tty_struct *tty, unsigned long arg); int (*get_icount)(struct tty_struct *tty, struct serial_icounter_struct *icount); /* Called by the tty layer for port level work. There may or may not -- cgit From 980373b7918b8023be6b7df03857f494ae124d0b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Mar 2013 12:36:52 +0100 Subject: USB: serial: add generic TIOCMIWAIT implementation Add generic TIOCMIWAIT implementation which correctly handles hangup, USB-device disconnect, does not rely on the deprecated sleep_on functions and hence does not suffer from the races currently affecting several usb-serial drivers. This makes it much easier to add TIOCMIWAIT support to subdrivers as the tricky details related to hangup and disconnect (e.g. atomicity, that the private port data may have been freed when woken up, and waking up processes at disconnect) have been handled once and for all. To add support to a subdriver, simply set the tiocmiwait-port-operation field, update the port icount fields and wake up any process sleeping on the tty-port modem-status-change wait queue on changes. Note that the tty-port initialised flag can be used to detect disconnected as the port will be hung up as part of disconnect (and cannot be reactivated due to the disconnected flag). However, as the tty-port implementation currently wakes up processes before calling port shutdown, the tty-hupping flag must also be checked to detect hangup for now. 
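A minimal sketch of that recipe in a subdriver (the EXAMPLE_* delta bits and names are illustrative, not taken from this patch): bump the counters under the port lock in the status-change path, wake the tty-port queue, and point the new operation at the generic helper.

static void example_update_msr(struct usb_serial_port *port, u8 msr)
{
	unsigned long flags;

	spin_lock_irqsave(&port->lock, flags);
	if (msr & EXAMPLE_MSR_DCTS)		/* hypothetical delta-CTS bit */
		port->icount.cts++;
	if (msr & EXAMPLE_MSR_DDSR)		/* hypothetical delta-DSR bit */
		port->icount.dsr++;
	spin_unlock_irqrestore(&port->lock, flags);

	/* Wake any process sleeping in TIOCMIWAIT. */
	wake_up_interruptible(&port->port.delta_msr_wait);
}

static struct usb_serial_driver example_device = {
	.driver = {
		.owner	= THIS_MODULE,
		.name	= "example",
	},
	.num_ports	= 1,
	.tiocmiwait	= usb_serial_generic_tiocmiwait,
};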
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/generic.c | 58 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/usb/serial.h | 5 ++++ 2 files changed, 63 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index aa71f6e72f61..18bc74e20fe1 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -418,6 +418,64 @@ void usb_serial_generic_unthrottle(struct tty_struct *tty) } EXPORT_SYMBOL_GPL(usb_serial_generic_unthrottle); +static bool usb_serial_generic_msr_changed(struct tty_struct *tty, + unsigned long arg, struct async_icount *cprev) +{ + struct usb_serial_port *port = tty->driver_data; + struct async_icount cnow; + unsigned long flags; + bool ret; + + /* + * Use tty-port initialised flag to detect all hangups including the + * one generated at USB-device disconnect. + * + * FIXME: Remove hupping check once tty_port_hangup calls shutdown + * (which clears the initialised flag) before wake up. + */ + if (test_bit(TTY_HUPPING, &tty->flags)) + return true; + if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + return true; + + spin_lock_irqsave(&port->lock, flags); + cnow = port->icount; /* atomic copy*/ + spin_unlock_irqrestore(&port->lock, flags); + + ret = ((arg & TIOCM_RNG) && (cnow.rng != cprev->rng)) || + ((arg & TIOCM_DSR) && (cnow.dsr != cprev->dsr)) || + ((arg & TIOCM_CD) && (cnow.dcd != cprev->dcd)) || + ((arg & TIOCM_CTS) && (cnow.cts != cprev->cts)); + + *cprev = cnow; + + return ret; +} + +int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg) +{ + struct usb_serial_port *port = tty->driver_data; + struct async_icount cnow; + unsigned long flags; + int ret; + + spin_lock_irqsave(&port->lock, flags); + cnow = port->icount; /* atomic copy */ + spin_unlock_irqrestore(&port->lock, flags); + + ret = wait_event_interruptible(port->port.delta_msr_wait, + usb_serial_generic_msr_changed(tty, arg, &cnow)); + if (!ret) { + if (test_bit(TTY_HUPPING, &tty->flags)) + ret = -EIO; + if (!test_bit(ASYNCB_INITIALIZED, &port->port.flags)) + ret = -EIO; + } + + return ret; +} +EXPORT_SYMBOL_GPL(usb_serial_generic_tiocmiwait); + #ifdef CONFIG_MAGIC_SYSRQ int usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch) { diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 9c8b53f80f48..47c8d2c506c8 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -15,6 +15,7 @@ #include #include +#include #include #include @@ -61,6 +62,7 @@ * @bulk_out_buffers: pointers to the bulk out buffers for this port * @write_urbs: pointers to the bulk out urbs for this port * @write_urbs_free: status bitmap the for bulk out urbs + * @icount: interrupt counters * @tx_bytes: number of bytes currently in host stack queues * @bulk_out_endpointAddress: endpoint address for the bulk out pipe for this * port. 
@@ -109,6 +111,7 @@ struct usb_serial_port { unsigned long write_urbs_free; __u8 bulk_out_endpointAddress; + struct async_icount icount; int tx_bytes; unsigned long flags; @@ -330,6 +333,8 @@ extern void usb_serial_generic_read_bulk_callback(struct urb *urb); extern void usb_serial_generic_write_bulk_callback(struct urb *urb); extern void usb_serial_generic_throttle(struct tty_struct *tty); extern void usb_serial_generic_unthrottle(struct tty_struct *tty); +extern int usb_serial_generic_tiocmiwait(struct tty_struct *tty, + unsigned long arg); extern int usb_serial_generic_register(void); extern void usb_serial_generic_deregister(void); extern int usb_serial_generic_submit_read_urbs(struct usb_serial_port *port, -- cgit From befefcda4bddc52d29248931801961a72aeef28b Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Mar 2013 12:36:54 +0100 Subject: USB: serial: add generic get_icount implementation Add generic get_icount implementation that subdrivers relying on the port interrupt counters can use. Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/generic.c | 27 +++++++++++++++++++++++++++ include/linux/usb/serial.h | 2 ++ 2 files changed, 29 insertions(+) (limited to 'include/linux') diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 18bc74e20fe1..5e55761b2cb8 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -476,6 +476,33 @@ int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg) } EXPORT_SYMBOL_GPL(usb_serial_generic_tiocmiwait); +int usb_serial_generic_get_icount(struct tty_struct *tty, + struct serial_icounter_struct *icount) +{ + struct usb_serial_port *port = tty->driver_data; + struct async_icount cnow; + unsigned long flags; + + spin_lock_irqsave(&port->lock, flags); + cnow = port->icount; /* atomic copy */ + spin_unlock_irqrestore(&port->lock, flags); + + icount->cts = cnow.cts; + icount->dsr = cnow.dsr; + icount->rng = cnow.rng; + icount->dcd = cnow.dcd; + icount->tx = cnow.tx; + icount->rx = cnow.rx; + icount->frame = cnow.frame; + icount->parity = cnow.parity; + icount->overrun = cnow.overrun; + icount->brk = cnow.brk; + icount->buf_overrun = cnow.buf_overrun; + + return 0; +} +EXPORT_SYMBOL_GPL(usb_serial_generic_get_icount); + #ifdef CONFIG_MAGIC_SYSRQ int usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch) { diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 47c8d2c506c8..c786ee7fca8f 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -335,6 +335,8 @@ extern void usb_serial_generic_throttle(struct tty_struct *tty); extern void usb_serial_generic_unthrottle(struct tty_struct *tty); extern int usb_serial_generic_tiocmiwait(struct tty_struct *tty, unsigned long arg); +extern int usb_serial_generic_get_icount(struct tty_struct *tty, + struct serial_icounter_struct *icount); extern int usb_serial_generic_register(void); extern void usb_serial_generic_deregister(void); extern int usb_serial_generic_submit_read_urbs(struct usb_serial_port *port, -- cgit From 53ab34dc50ad99366257d34cdb8a84f24250d611 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 21 Mar 2013 12:37:35 +0100 Subject: USB: serial: remove unused MSR-wait queue Remove the port MSR-wait queue now that all drivers have been migrated to the tty-port queue. 
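The migration mentioned above amounts to switching each subdriver from the port queue to the tty-port queue, roughly (illustrative, not taken from this patch):

-	wake_up_interruptible(&port->delta_msr_wait);
+	wake_up_interruptible(&port->port.delta_msr_wait);

with the matching change in the corresponding wait_event_interruptible() call.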
Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- include/linux/usb/serial.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index c786ee7fca8f..b9b0f7b4e43b 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -68,7 +68,6 @@ * port. * @flags: usb serial port flags * @write_wait: a wait_queue_head_t used by the port. - * @delta_msr_wait: modem-status-change wait queue * @work: work queue entry for the line discipline waking up. * @throttled: nonzero if the read urb is inactive to throttle the device * @throttle_req: nonzero if the tty wants to throttle us @@ -116,7 +115,6 @@ struct usb_serial_port { unsigned long flags; wait_queue_head_t write_wait; - wait_queue_head_t delta_msr_wait; struct work_struct work; char throttled; char throttle_req; -- cgit From 1e9663c62b32f695af37fec4afc473b59f5ca9b4 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 25 Mar 2013 08:58:00 +0000 Subject: iio:trigger: Introduce iio_trigger_{set,get}_drvdata Introduce iio_trigger_{set,get}_drvdata, which allows attaching driver-specific data to a trigger. The functions wrap access to the trigger's private_data field and all current users are updated to use iio_trigger_{set,get}_drvdata instead of directly accessing the private_data field. This is the first step towards removing the private_data field from the iio_trigger struct. The following coccinelle script has been used to update the drivers: @@ struct iio_trigger *trigger; expression priv; @@ -trigger->private_data = priv +iio_trigger_set_drvdata(trigger, priv) @@ struct iio_trigger *trigger; @@ -trigger->private_data +iio_trigger_get_drvdata(trigger) Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/iio/accel/st_accel_buffer.c | 2 +- drivers/iio/adc/ad_sigma_delta.c | 2 +- drivers/iio/adc/at91_adc.c | 4 ++-- .../iio/common/hid-sensors/hid-sensor-trigger.c | 4 ++-- drivers/iio/common/st_sensors/st_sensors_trigger.c | 2 +- drivers/iio/gyro/itg3200_buffer.c | 4 ++-- drivers/iio/gyro/st_gyro_buffer.c | 2 +- drivers/iio/imu/adis_trigger.c | 4 ++-- drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c | 4 ++-- drivers/staging/iio/accel/lis3l02dq_ring.c | 6 +++--- drivers/staging/iio/adc/mxs-lradc.c | 4 ++-- drivers/staging/iio/meter/ade7758_trigger.c | 6 +++--- drivers/staging/iio/trigger/iio-trig-bfin-timer.c | 8 ++++---- drivers/staging/iio/trigger/iio-trig-gpio.c | 6 +++--- .../staging/iio/trigger/iio-trig-periodic-rtc.c | 12 +++++------ drivers/staging/iio/trigger/iio-trig-sysfs.c | 4 ++-- include/linux/iio/trigger.h | 24 ++++++++++++++++++++++ 17 files changed, 61 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/drivers/iio/accel/st_accel_buffer.c b/drivers/iio/accel/st_accel_buffer.c index 6bd82c7f769c..d9b350756f90 100644 --- a/drivers/iio/accel/st_accel_buffer.c +++ b/drivers/iio/accel/st_accel_buffer.c @@ -25,7 +25,7 @@ int st_accel_trig_set_state(struct iio_trigger *trig, bool state) { - struct iio_dev *indio_dev = trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); return st_sensors_set_dataready_irq(indio_dev, state); } diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index afe6d78c8ff0..f0d6335ae087 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -470,7 +470,7 @@ static int ad_sd_probe_trigger(struct iio_dev *indio_dev) disable_irq_nosync(sigma_delta->spi->irq); }
sigma_delta->trig->dev.parent = &sigma_delta->spi->dev; - sigma_delta->trig->private_data = sigma_delta; + iio_trigger_set_drvdata(sigma_delta->trig, sigma_delta); ret = iio_trigger_register(sigma_delta->trig); if (ret) diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c index 92eb6a5b9e72..6fc43c15f028 100644 --- a/drivers/iio/adc/at91_adc.c +++ b/drivers/iio/adc/at91_adc.c @@ -188,7 +188,7 @@ static u8 at91_adc_get_trigger_value_by_name(struct iio_dev *idev, static int at91_adc_configure_trigger(struct iio_trigger *trig, bool state) { - struct iio_dev *idev = trig->private_data; + struct iio_dev *idev = iio_trigger_get_drvdata(trig); struct at91_adc_state *st = iio_priv(idev); struct iio_buffer *buffer = idev->buffer; struct at91_adc_reg_desc *reg = st->registers; @@ -254,7 +254,7 @@ static struct iio_trigger *at91_adc_allocate_trigger(struct iio_dev *idev, return NULL; trig->dev.parent = idev->dev.parent; - trig->private_data = idev; + iio_trigger_set_drvdata(trig, idev); trig->ops = &at91_adc_trigger_ops; ret = iio_trigger_register(trig); diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c index 7a525a91105d..87419c41b991 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c @@ -31,7 +31,7 @@ static int hid_sensor_data_rdy_trigger_set_state(struct iio_trigger *trig, bool state) { - struct hid_sensor_common *st = trig->private_data; + struct hid_sensor_common *st = iio_trigger_get_drvdata(trig); int state_val; state_val = state ? 1 : 0; @@ -76,7 +76,7 @@ int hid_sensor_setup_trigger(struct iio_dev *indio_dev, const char *name, } trig->dev.parent = indio_dev->dev.parent; - trig->private_data = attrb; + iio_trigger_set_drvdata(trig, attrb); trig->ops = &hid_sensor_trigger_ops; ret = iio_trigger_register(trig); diff --git a/drivers/iio/common/st_sensors/st_sensors_trigger.c b/drivers/iio/common/st_sensors/st_sensors_trigger.c index 139ed030abb0..8fc3a97eb266 100644 --- a/drivers/iio/common/st_sensors/st_sensors_trigger.c +++ b/drivers/iio/common/st_sensors/st_sensors_trigger.c @@ -40,7 +40,7 @@ int st_sensors_allocate_trigger(struct iio_dev *indio_dev, if (err) goto request_irq_error; - sdata->trig->private_data = indio_dev; + iio_trigger_set_drvdata(sdata->trig, indio_dev); sdata->trig->ops = trigger_ops; sdata->trig->dev.parent = sdata->dev; diff --git a/drivers/iio/gyro/itg3200_buffer.c b/drivers/iio/gyro/itg3200_buffer.c index f667d2c8c00f..6c43af9bb0a4 100644 --- a/drivers/iio/gyro/itg3200_buffer.c +++ b/drivers/iio/gyro/itg3200_buffer.c @@ -81,7 +81,7 @@ void itg3200_buffer_unconfigure(struct iio_dev *indio_dev) static int itg3200_data_rdy_trigger_set_state(struct iio_trigger *trig, bool state) { - struct iio_dev *indio_dev = trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); int ret; u8 msc; @@ -129,7 +129,7 @@ int itg3200_probe_trigger(struct iio_dev *indio_dev) st->trig->dev.parent = &st->i2c->dev; st->trig->ops = &itg3200_trigger_ops; - st->trig->private_data = indio_dev; + iio_trigger_set_drvdata(st->trig, indio_dev); ret = iio_trigger_register(st->trig); if (ret) goto error_free_irq; diff --git a/drivers/iio/gyro/st_gyro_buffer.c b/drivers/iio/gyro/st_gyro_buffer.c index da4d122ec7dc..69017c7ec302 100644 --- a/drivers/iio/gyro/st_gyro_buffer.c +++ b/drivers/iio/gyro/st_gyro_buffer.c @@ -25,7 +25,7 @@ int st_gyro_trig_set_state(struct iio_trigger *trig, bool state) { - struct iio_dev *indio_dev = 
trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); return st_sensors_set_dataready_irq(indio_dev, state); } diff --git a/drivers/iio/imu/adis_trigger.c b/drivers/iio/imu/adis_trigger.c index 5a24c9cac343..e0017c22bb9c 100644 --- a/drivers/iio/imu/adis_trigger.c +++ b/drivers/iio/imu/adis_trigger.c @@ -19,7 +19,7 @@ static int adis_data_rdy_trigger_set_state(struct iio_trigger *trig, bool state) { - struct adis *adis = trig->private_data; + struct adis *adis = iio_trigger_get_drvdata(trig); return adis_enable_irq(adis, state); } @@ -57,7 +57,7 @@ int adis_probe_trigger(struct adis *adis, struct iio_dev *indio_dev) adis->trig->dev.parent = &adis->spi->dev; adis->trig->ops = &adis_trigger_ops; - adis->trig->private_data = adis; + iio_trigger_set_drvdata(adis->trig, adis); ret = iio_trigger_register(adis->trig); indio_dev->trig = adis->trig; diff --git a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c index e1d0869e0ad1..03b9372c1212 100644 --- a/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c +++ b/drivers/iio/imu/inv_mpu6050/inv_mpu_trigger.c @@ -103,7 +103,7 @@ static int inv_mpu6050_set_enable(struct iio_dev *indio_dev, bool enable) static int inv_mpu_data_rdy_trigger_set_state(struct iio_trigger *trig, bool state) { - return inv_mpu6050_set_enable(trig->private_data, state); + return inv_mpu6050_set_enable(iio_trigger_get_drvdata(trig), state); } static const struct iio_trigger_ops inv_mpu_trigger_ops = { @@ -130,8 +130,8 @@ int inv_mpu6050_probe_trigger(struct iio_dev *indio_dev) if (ret) goto error_free_trig; st->trig->dev.parent = &st->client->dev; - st->trig->private_data = indio_dev; st->trig->ops = &inv_mpu_trigger_ops; + iio_trigger_set_drvdata(st->trig, indio_dev); ret = iio_trigger_register(st->trig); if (ret) goto error_free_irq; diff --git a/drivers/staging/iio/accel/lis3l02dq_ring.c b/drivers/staging/iio/accel/lis3l02dq_ring.c index e676403ea3ea..5b8f0f6c9938 100644 --- a/drivers/staging/iio/accel/lis3l02dq_ring.c +++ b/drivers/staging/iio/accel/lis3l02dq_ring.c @@ -228,7 +228,7 @@ error_ret: static int lis3l02dq_data_rdy_trigger_set_state(struct iio_trigger *trig, bool state) { - struct iio_dev *indio_dev = trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); int ret = 0; u8 t; @@ -252,7 +252,7 @@ static int lis3l02dq_data_rdy_trigger_set_state(struct iio_trigger *trig, */ static int lis3l02dq_trig_try_reen(struct iio_trigger *trig) { - struct iio_dev *indio_dev = trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct lis3l02dq_state *st = iio_priv(indio_dev); int i; @@ -290,7 +290,7 @@ int lis3l02dq_probe_trigger(struct iio_dev *indio_dev) st->trig->dev.parent = &st->us->dev; st->trig->ops = &lis3l02dq_trigger_ops; - st->trig->private_data = indio_dev; + iio_trigger_set_drvdata(st->trig, indio_dev); ret = iio_trigger_register(st->trig); if (ret) goto error_free_trig; diff --git a/drivers/staging/iio/adc/mxs-lradc.c b/drivers/staging/iio/adc/mxs-lradc.c index 25a4359a92db..eab975d11bb2 100644 --- a/drivers/staging/iio/adc/mxs-lradc.c +++ b/drivers/staging/iio/adc/mxs-lradc.c @@ -646,7 +646,7 @@ static irqreturn_t mxs_lradc_trigger_handler(int irq, void *p) static int mxs_lradc_configure_trigger(struct iio_trigger *trig, bool state) { - struct iio_dev *iio = trig->private_data; + struct iio_dev *iio = iio_trigger_get_drvdata(trig); struct mxs_lradc *lradc = iio_priv(iio); const uint32_t st = state ? 
STMP_OFFSET_REG_SET : STMP_OFFSET_REG_CLR; @@ -670,7 +670,7 @@ static int mxs_lradc_trigger_init(struct iio_dev *iio) return -ENOMEM; trig->dev.parent = iio->dev.parent; - trig->private_data = iio; + iio_trigger_set_drvdata(trig, iio); trig->ops = &mxs_lradc_trigger_ops; ret = iio_trigger_register(trig); diff --git a/drivers/staging/iio/meter/ade7758_trigger.c b/drivers/staging/iio/meter/ade7758_trigger.c index f9c6a340092b..7a94ddd42f59 100644 --- a/drivers/staging/iio/meter/ade7758_trigger.c +++ b/drivers/staging/iio/meter/ade7758_trigger.c @@ -32,7 +32,7 @@ static irqreturn_t ade7758_data_rdy_trig_poll(int irq, void *private) static int ade7758_data_rdy_trigger_set_state(struct iio_trigger *trig, bool state) { - struct iio_dev *indio_dev = trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); dev_dbg(&indio_dev->dev, "%s (%d)\n", __func__, state); return ade7758_set_irq(&indio_dev->dev, state); @@ -44,7 +44,7 @@ static int ade7758_data_rdy_trigger_set_state(struct iio_trigger *trig, **/ static int ade7758_trig_try_reen(struct iio_trigger *trig) { - struct iio_dev *indio_dev = trig->private_data; + struct iio_dev *indio_dev = iio_trigger_get_drvdata(trig); struct ade7758_state *st = iio_priv(indio_dev); enable_irq(st->us->irq); @@ -81,7 +81,7 @@ int ade7758_probe_trigger(struct iio_dev *indio_dev) st->trig->dev.parent = &st->us->dev; st->trig->ops = &ade7758_trigger_ops; - st->trig->private_data = indio_dev; + iio_trigger_set_drvdata(st->trig, indio_dev); ret = iio_trigger_register(st->trig); /* select default trigger */ diff --git a/drivers/staging/iio/trigger/iio-trig-bfin-timer.c b/drivers/staging/iio/trigger/iio-trig-bfin-timer.c index 42798da575c0..38a158b77b1d 100644 --- a/drivers/staging/iio/trigger/iio-trig-bfin-timer.c +++ b/drivers/staging/iio/trigger/iio-trig-bfin-timer.c @@ -65,7 +65,7 @@ struct bfin_tmr_state { static int iio_bfin_tmr_set_state(struct iio_trigger *trig, bool state) { - struct bfin_tmr_state *st = trig->private_data; + struct bfin_tmr_state *st = iio_trigger_get_drvdata(trig); if (get_gptimer_period(st->t->id) == 0) return -EINVAL; @@ -82,7 +82,7 @@ static ssize_t iio_bfin_tmr_frequency_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct iio_trigger *trig = to_iio_trigger(dev); - struct bfin_tmr_state *st = trig->private_data; + struct bfin_tmr_state *st = iio_trigger_get_drvdata(trig); unsigned long val; bool enabled; int ret; @@ -125,7 +125,7 @@ static ssize_t iio_bfin_tmr_frequency_show(struct device *dev, char *buf) { struct iio_trigger *trig = to_iio_trigger(dev); - struct bfin_tmr_state *st = trig->private_data; + struct bfin_tmr_state *st = iio_trigger_get_drvdata(trig); unsigned int period = get_gptimer_period(st->t->id); unsigned long val; @@ -213,9 +213,9 @@ static int iio_bfin_tmr_trigger_probe(struct platform_device *pdev) goto out1; } - st->trig->private_data = st; st->trig->ops = &iio_bfin_tmr_trigger_ops; st->trig->dev.groups = iio_bfin_tmr_trigger_attr_groups; + iio_trigger_set_drvdata(st->trig, st); ret = iio_trigger_register(st->trig); if (ret) goto out2; diff --git a/drivers/staging/iio/trigger/iio-trig-gpio.c b/drivers/staging/iio/trigger/iio-trig-gpio.c index fcc4cb048c9a..7c593d18a910 100644 --- a/drivers/staging/iio/trigger/iio-trig-gpio.c +++ b/drivers/staging/iio/trigger/iio-trig-gpio.c @@ -83,7 +83,7 @@ static int iio_gpio_trigger_probe(struct platform_device *pdev) ret = -ENOMEM; goto error_put_trigger; } - trig->private_data = trig_info; + 
iio_trigger_set_drvdata(trig, trig_info); trig_info->irq = irq; trig->ops = &iio_gpio_trigger_ops; ret = request_irq(irq, iio_gpio_trigger_poll, @@ -121,7 +121,7 @@ error_free_completed_registrations: trig2, &iio_gpio_trigger_list, alloc_list) { - trig_info = trig->private_data; + trig_info = iio_trigger_get_drvdata(trig); free_irq(gpio_to_irq(trig_info->irq), trig); kfree(trig_info); iio_trigger_unregister(trig); @@ -140,7 +140,7 @@ static int iio_gpio_trigger_remove(struct platform_device *pdev) trig2, &iio_gpio_trigger_list, alloc_list) { - trig_info = trig->private_data; + trig_info = iio_trigger_get_drvdata(trig); iio_trigger_unregister(trig); free_irq(trig_info->irq, trig); kfree(trig_info); diff --git a/drivers/staging/iio/trigger/iio-trig-periodic-rtc.c b/drivers/staging/iio/trigger/iio-trig-periodic-rtc.c index 9102b1ba2530..79695974b1d4 100644 --- a/drivers/staging/iio/trigger/iio-trig-periodic-rtc.c +++ b/drivers/staging/iio/trigger/iio-trig-periodic-rtc.c @@ -30,7 +30,7 @@ struct iio_prtc_trigger_info { static int iio_trig_periodic_rtc_set_state(struct iio_trigger *trig, bool state) { - struct iio_prtc_trigger_info *trig_info = trig->private_data; + struct iio_prtc_trigger_info *trig_info = iio_trigger_get_drvdata(trig); if (trig_info->frequency == 0) return -EINVAL; printk(KERN_INFO "trigger frequency is %d\n", trig_info->frequency); @@ -42,7 +42,7 @@ static ssize_t iio_trig_periodic_read_freq(struct device *dev, char *buf) { struct iio_trigger *trig = to_iio_trigger(dev); - struct iio_prtc_trigger_info *trig_info = trig->private_data; + struct iio_prtc_trigger_info *trig_info = iio_trigger_get_drvdata(trig); return sprintf(buf, "%u\n", trig_info->frequency); } @@ -52,7 +52,7 @@ static ssize_t iio_trig_periodic_write_freq(struct device *dev, size_t len) { struct iio_trigger *trig = to_iio_trigger(dev); - struct iio_prtc_trigger_info *trig_info = trig->private_data; + struct iio_prtc_trigger_info *trig_info = iio_trigger_get_drvdata(trig); unsigned long val; int ret; @@ -124,7 +124,7 @@ static int iio_trig_periodic_rtc_probe(struct platform_device *dev) ret = -ENOMEM; goto error_put_trigger_and_remove_from_list; } - trig->private_data = trig_info; + iio_trigger_set_drvdata(trig, trig_info); trig->ops = &iio_prtc_trigger_ops; /* RTC access */ trig_info->rtc @@ -158,7 +158,7 @@ error_free_completed_registrations: trig2, &iio_prtc_trigger_list, alloc_list) { - trig_info = trig->private_data; + trig_info = iio_trigger_get_drvdata(trig); rtc_irq_unregister(trig_info->rtc, &trig_info->task); rtc_class_close(trig_info->rtc); kfree(trig_info); @@ -176,7 +176,7 @@ static int iio_trig_periodic_rtc_remove(struct platform_device *dev) trig2, &iio_prtc_trigger_list, alloc_list) { - trig_info = trig->private_data; + trig_info = iio_trigger_get_drvdata(trig); rtc_irq_unregister(trig_info->rtc, &trig_info->task); rtc_class_close(trig_info->rtc); kfree(trig_info); diff --git a/drivers/staging/iio/trigger/iio-trig-sysfs.c b/drivers/staging/iio/trigger/iio-trig-sysfs.c index 3bac97224bf4..b727bde8b7fe 100644 --- a/drivers/staging/iio/trigger/iio-trig-sysfs.c +++ b/drivers/staging/iio/trigger/iio-trig-sysfs.c @@ -103,7 +103,7 @@ static ssize_t iio_sysfs_trigger_poll(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct iio_trigger *trig = to_iio_trigger(dev); - struct iio_sysfs_trig *sysfs_trig = trig->private_data; + struct iio_sysfs_trig *sysfs_trig = iio_trigger_get_drvdata(trig); irq_work_queue(&sysfs_trig->work); @@ -160,7 +160,7 @@ static int 
iio_sysfs_trigger_probe(int id) t->trig->dev.groups = iio_sysfs_trigger_attr_groups; t->trig->ops = &iio_sysfs_trigger_ops; t->trig->dev.parent = &iio_sysfs_trig_dev; - t->trig->private_data = t; + iio_trigger_set_drvdata(t->trig, t); init_irq_work(&t->work, iio_sysfs_trigger_work); diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index c66e0a96f6e8..b81948aba1d6 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -91,6 +91,30 @@ static inline void iio_trigger_get(struct iio_trigger *trig) __module_get(trig->ops->owner); } +/** + * iio_device_set_drvdata() - Set trigger driver data + * @trig: IIO trigger structure + * @data: Driver specific data + * + * Allows to attach an arbitrary pointer to an IIO trigger, which can later be + * retrieved by iio_trigger_get_drvdata(). + */ +static inline void iio_trigger_set_drvdata(struct iio_trigger *trig, void *data) +{ + trig->private_data = data; +} + +/** + * iio_trigger_get_drvdata() - Get trigger driver data + * @trig: IIO trigger structure + * + * Returns the data previously set with iio_trigger_set_drvdata() + */ +static inline void *iio_trigger_get_drvdata(struct iio_trigger *trig) +{ + return trig->private_data; +} + /** * iio_trigger_register() - register a trigger with the IIO core * @trig_info: trigger to be registered -- cgit From 5034bfc976928b447cb6decd311d35161107a72f Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Mon, 25 Mar 2013 08:58:00 +0000 Subject: iio:trigger: Use dev_{set,get}_drvdata for private data management Use dev_{set,get}_drvdata for managing private data attached to a trigger instead of using a custom field in the iio_trigger struct. Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron --- drivers/staging/iio/Documentation/trigger.txt | 3 --- include/linux/iio/trigger.h | 6 ++---- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/drivers/staging/iio/Documentation/trigger.txt b/drivers/staging/iio/Documentation/trigger.txt index 75cc37ff1ed0..64e2e08fb4d0 100644 --- a/drivers/staging/iio/Documentation/trigger.txt +++ b/drivers/staging/iio/Documentation/trigger.txt @@ -10,9 +10,6 @@ struct iio_trig *trig = iio_trigger_alloc("", ...); allocates a trigger structure. The key elements to then fill in within a driver are: -trig->private_data - Device specific private data. - trig->owner Typically set to THIS_MODULE. Used to ensure correct ownership of core allocated resources. 
diff --git a/include/linux/iio/trigger.h b/include/linux/iio/trigger.h index b81948aba1d6..3869c525b052 100644 --- a/include/linux/iio/trigger.h +++ b/include/linux/iio/trigger.h @@ -44,7 +44,6 @@ struct iio_trigger_ops { * @id: [INTERN] unique id number * @name: [DRIVER] unique name * @dev: [DRIVER] associated device (if relevant) - * @private_data: [DRIVER] device specific data * @list: [INTERN] used in maintenance of global trigger list * @alloc_list: [DRIVER] used for driver specific trigger list * @use_count: use count for the trigger @@ -60,7 +59,6 @@ struct iio_trigger { const char *name; struct device dev; - void *private_data; struct list_head list; struct list_head alloc_list; int use_count; @@ -101,7 +99,7 @@ static inline void iio_trigger_get(struct iio_trigger *trig) */ static inline void iio_trigger_set_drvdata(struct iio_trigger *trig, void *data) { - trig->private_data = data; + dev_set_drvdata(&trig->dev, data); } /** @@ -112,7 +110,7 @@ static inline void iio_trigger_set_drvdata(struct iio_trigger *trig, void *data) */ static inline void *iio_trigger_get_drvdata(struct iio_trigger *trig) { - return trig->private_data; + return dev_get_drvdata(&trig->dev); } /** -- cgit From 57df8106932b57427df1eaaa13871857f75b1194 Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Fri, 8 Feb 2013 14:52:06 +0800 Subject: Thermal: exynos: fix cooling state translation Signed-off-by: Zhang Rui Tested-by: Amit Daniel kachhap --- drivers/thermal/cpu_cooling.c | 11 +++++++++++ drivers/thermal/exynos_thermal.c | 24 ++---------------------- include/linux/cpu_cooling.h | 7 +++++++ include/linux/thermal.h | 5 ++++- 4 files changed, 24 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c index 9e208d300647..e03891b03c9b 100644 --- a/drivers/thermal/cpu_cooling.c +++ b/drivers/thermal/cpu_cooling.c @@ -196,6 +196,17 @@ static int get_property(unsigned int cpu, unsigned long input, return -EINVAL; } +unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq) +{ + unsigned int val; + + if (get_property(cpu, (unsigned long)freq, &val, GET_LEVEL)) + return THERMAL_CSTATE_INVALID; + return (unsigned long)val; +} + +EXPORT_SYMBOL(cpufreq_cooling_get_level); + /** * get_cpu_frequency - get the absolute value of frequency from level. * @cpu: cpu for which frequency is fetched. 
diff --git a/drivers/thermal/exynos_thermal.c b/drivers/thermal/exynos_thermal.c index 46568c078dee..541257888c3e 100644 --- a/drivers/thermal/exynos_thermal.c +++ b/drivers/thermal/exynos_thermal.c @@ -242,26 +242,6 @@ static int exynos_get_crit_temp(struct thermal_zone_device *thermal, return ret; } -static int exynos_get_frequency_level(unsigned int cpu, unsigned int freq) -{ - int i = 0, ret = -EINVAL; - struct cpufreq_frequency_table *table = NULL; -#ifdef CONFIG_CPU_FREQ - table = cpufreq_frequency_get_table(cpu); -#endif - if (!table) - return ret; - - while (table[i].frequency != CPUFREQ_TABLE_END) { - if (table[i].frequency == CPUFREQ_ENTRY_INVALID) - continue; - if (table[i].frequency == freq) - return i; - i++; - } - return ret; -} - /* Bind callback functions for thermal zone */ static int exynos_bind(struct thermal_zone_device *thermal, struct thermal_cooling_device *cdev) @@ -288,8 +268,8 @@ static int exynos_bind(struct thermal_zone_device *thermal, /* Bind the thermal zone to the cpufreq cooling device */ for (i = 0; i < tab_size; i++) { clip_data = (struct freq_clip_table *)&(tab_ptr[i]); - level = exynos_get_frequency_level(0, clip_data->freq_clip_max); - if (level < 0) + level = cpufreq_cooling_get_level(0, clip_data->freq_clip_max); + if (level == THERMAL_CSTATE_INVALID) return 0; switch (GET_ZONE(i)) { case MONITOR_ZONE: diff --git a/include/linux/cpu_cooling.h b/include/linux/cpu_cooling.h index 40b4ef54cc7d..bc479b1e0fd9 100644 --- a/include/linux/cpu_cooling.h +++ b/include/linux/cpu_cooling.h @@ -42,6 +42,8 @@ struct thermal_cooling_device *cpufreq_cooling_register( * @cdev: thermal cooling device pointer. */ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev); + +unsigned long cpufreq_cooling_get_level(unsigned int cpu, unsigned int freq); #else /* !CONFIG_CPU_THERMAL */ static inline struct thermal_cooling_device *cpufreq_cooling_register( const struct cpumask *clip_cpus) @@ -53,6 +55,11 @@ static inline void cpufreq_cooling_unregister( { return; } +static inline unsigned long cpufreq_cooling_get_level(unsigned int cpu, + unsigned int freq) +{ + return THERMAL_CSTATE_INVALID; +} #endif /* CONFIG_CPU_THERMAL */ #endif /* __CPU_COOLING_H__ */ diff --git a/include/linux/thermal.h b/include/linux/thermal.h index f0bd7f90a90d..5a3b428daaab 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -33,8 +33,11 @@ #define THERMAL_MAX_TRIPS 12 #define THERMAL_NAME_LENGTH 20 +/* invalid cooling state */ +#define THERMAL_CSTATE_INVALID -1UL + /* No upper/lower limit requirement */ -#define THERMAL_NO_LIMIT -1UL +#define THERMAL_NO_LIMIT THERMAL_CSTATE_INVALID /* Unit conversion macros */ #define KELVIN_TO_CELSIUS(t) (long)(((long)t-2732 >= 0) ? \ -- cgit From f8b587055a793c7719f0d4f41b7b4aeeef43aa2d Mon Sep 17 00:00:00 2001 From: Ezequiel Garcia Date: Wed, 20 Mar 2013 21:38:07 +0000 Subject: thermal: Fix compiler warning The following warning is obtained when CONFIG_NET is not defined: In file included from drivers/thermal/mvebu_thermal.c:27:0: include/linux/thermal.h:254:12: warning: 'thermal_generate_netlink_event' defined but not used [-Wunused-function] This patch fixes the warning by properly inlining thermal_generate_netlink_event().
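For reference, the resulting shape of the header (reconstructed from the hunk below) is the standard config-gated stub pattern: a plain static stub defined in a header triggers -Wunused-function in every file that includes the header without calling it, whereas a static inline stub stays silent and normally generates no code at all:

	#ifdef CONFIG_NET
	extern int thermal_generate_netlink_event(struct thermal_zone_device *tz,
						  enum events event);
	#else
	/* 'static' alone warns in files that never call the stub;
	 * 'static inline' does not, and compiles away entirely. */
	static inline int thermal_generate_netlink_event(struct thermal_zone_device *tz,
							 enum events event)
	{
		return 0;
	}
	#endif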
Signed-off-by: Ezequiel Garcia Signed-off-by: Zhang Rui --- include/linux/thermal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/thermal.h b/include/linux/thermal.h index f0bd7f90a90d..fd7b8f3e6f42 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -251,7 +251,7 @@ void thermal_unregister_governor(struct thermal_governor *); extern int thermal_generate_netlink_event(struct thermal_zone_device *tz, enum events event); #else -static int thermal_generate_netlink_event(struct thermal_zone_device *tz, +static inline int thermal_generate_netlink_event(struct thermal_zone_device *tz, enum events event) { return 0; -- cgit From c0f4dfd4f90f1667d234d21f15153ea09a2eaa66 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 28 Dec 2012 11:30:36 -0800 Subject: rcu: Make RCU_FAST_NO_HZ take advantage of numbered callbacks Because RCU callbacks are now associated with the number of the grace period that they must wait for, CPUs can now take advance callbacks corresponding to grace periods that ended while a given CPU was in dyntick-idle mode. This eliminates the need to try forcing the RCU state machine while entering idle, thus reducing the CPU intensiveness of RCU_FAST_NO_HZ, which should increase its energy efficiency. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 28 ++- include/linux/rcupdate.h | 1 + init/Kconfig | 17 +- kernel/rcutree.c | 28 +-- kernel/rcutree.h | 12 +- kernel/rcutree_plugin.h | 374 ++++++++++-------------------------- kernel/rcutree_trace.c | 2 - 7 files changed, 149 insertions(+), 313 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index a17ba16c8fc8..22303b2e74bc 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2490,6 +2490,17 @@ bytes respectively. Such letter suffixes can also be entirely omitted. leaf rcu_node structure. Useful for very large systems. + rcutree.jiffies_till_first_fqs= [KNL,BOOT] + Set delay from grace-period initialization to + first attempt to force quiescent states. + Units are jiffies, minimum value is zero, + and maximum value is HZ. + + rcutree.jiffies_till_next_fqs= [KNL,BOOT] + Set delay between subsequent attempts to force + quiescent states. Units are jiffies, minimum + value is one, and maximum value is HZ. + rcutree.qhimark= [KNL,BOOT] Set threshold of queued RCU callbacks over which batch limiting is disabled. @@ -2504,16 +2515,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted. rcutree.rcu_cpu_stall_timeout= [KNL,BOOT] Set timeout for RCU CPU stall warning messages. - rcutree.jiffies_till_first_fqs= [KNL,BOOT] - Set delay from grace-period initialization to - first attempt to force quiescent states. - Units are jiffies, minimum value is zero, - and maximum value is HZ. + rcutree.rcu_idle_gp_delay= [KNL,BOOT] + Set wakeup interval for idle CPUs that have + RCU callbacks (RCU_FAST_NO_HZ=y). - rcutree.jiffies_till_next_fqs= [KNL,BOOT] - Set delay between subsequent attempts to force - quiescent states. Units are jiffies, minimum - value is one, and maximum value is HZ. + rcutree.rcu_idle_lazy_gp_delay= [KNL,BOOT] + Set wakeup interval for idle CPUs that have + only "lazy" RCU callbacks (RCU_FAST_NO_HZ=y). + Lazy RCU callbacks are those which RCU can + prove do nothing more than free memory. 
rcutorture.fqs_duration= [KNL,BOOT] Set duration of force_quiescent_state bursts. diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index b758ce17b309..9ed2c9a4de45 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -80,6 +80,7 @@ extern void do_trace_rcu_torture_read(char *rcutorturename, #define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b)) #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) +#define ulong2long(a) (*(long *)(&(a))) /* Exported common interfaces */ diff --git a/init/Kconfig b/init/Kconfig index 717584064a7e..a3a2304fa6d2 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -582,13 +582,16 @@ config RCU_FAST_NO_HZ depends on NO_HZ && SMP default n help - This option causes RCU to attempt to accelerate grace periods in - order to allow CPUs to enter dynticks-idle state more quickly. - On the other hand, this option increases the overhead of the - dynticks-idle checking, thus degrading scheduling latency. - - Say Y if energy efficiency is critically important, and you don't - care about real-time response. + This option permits CPUs to enter dynticks-idle state even if + they have RCU callbacks queued, and prevents RCU from waking + these CPUs up more than roughly once every four jiffies (by + default, you can adjust this using the rcutree.rcu_idle_gp_delay + parameter), thus improving energy efficiency. On the other + hand, this option increases the duration of RCU grace periods, + for example, slowing down synchronize_rcu(). + + Say Y if energy efficiency is critically important, and you + don't care about increased grace-period durations. Say N if you are unsure. diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 2015bce749f9..7b1d7769872a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2640,19 +2640,27 @@ static int rcu_pending(int cpu) } /* - * Check to see if any future RCU-related work will need to be done - * by the current CPU, even if none need be done immediately, returning - * 1 if so. + * Return true if the specified CPU has any callback. If all_lazy is + * non-NULL, store an indication of whether all callbacks are lazy. + * (If there are no callbacks, all of them are deemed to be lazy.) */ -static int rcu_cpu_has_callbacks(int cpu) +static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) { + bool al = true; + bool hc = false; + struct rcu_data *rdp; struct rcu_state *rsp; - /* RCU callbacks either ready or pending? */ - for_each_rcu_flavor(rsp) - if (per_cpu_ptr(rsp->rda, cpu)->nxtlist) - return 1; - return 0; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->qlen != rdp->qlen_lazy) + al = false; + if (rdp->nxtlist) + hc = true; + } + if (all_lazy) + *all_lazy = al; + return hc; } /* @@ -2871,7 +2879,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - rcu_prepare_for_idle_init(cpu); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* Add CPU to rcu_node bitmasks. 
*/ @@ -2945,7 +2952,6 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, */ for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); - rcu_cleanup_after_idle(cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b6c2335efbdf..96a27f922e92 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,18 +88,13 @@ struct rcu_dynticks { int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ #ifdef CONFIG_RCU_FAST_NO_HZ - int dyntick_drain; /* Prepare-for-idle state variable. */ - unsigned long dyntick_holdoff; - /* No retries for the jiffy of failure. */ - struct timer_list idle_gp_timer; - /* Wake up CPU sleeping with callbacks. */ - unsigned long idle_gp_timer_expires; - /* When to wake up CPU (for repost). */ - bool idle_first_pass; /* First pass of attempt to go idle? */ + bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; /* # times non-lazy CBs posted to CPU. */ unsigned long nonlazy_posted_snap; /* idle-period nonlazy_posted snapshot. */ + unsigned long last_accelerate; + /* Last jiffy CBs were accelerated. */ int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ }; @@ -521,7 +516,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_BOOST */ static void __cpuinit rcu_prepare_kthreads(int cpu); -static void rcu_prepare_for_idle_init(int cpu); static void rcu_cleanup_after_idle(int cpu); static void rcu_prepare_for_idle(int cpu); static void rcu_idle_count_callbacks_posted(void); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 28185ad18df3..d318f9f18be5 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1543,14 +1543,7 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { *delta_jiffies = ULONG_MAX; - return rcu_cpu_has_callbacks(cpu); -} - -/* - * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ + return rcu_cpu_has_callbacks(cpu, NULL); } /* @@ -1587,16 +1580,6 @@ static void rcu_idle_count_callbacks_posted(void) * * The following three proprocessor symbols control this state machine: * - * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt - * to satisfy RCU. Beyond this point, it is better to incur a periodic - * scheduling-clock interrupt than to loop through the state machine - * at full power. - * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are - * optional if RCU does not need anything immediately from this - * CPU, even if this CPU still has RCU callbacks queued. The first - * times through the state machine are mandatory: we need to give - * the state machine a chance to communicate a quiescent state - * to the RCU core. * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted * to sleep in dyntick-idle mode with RCU callbacks pending. This * is sized to be roughly one RCU grace period. Those energy-efficiency @@ -1612,15 +1595,9 @@ static void rcu_idle_count_callbacks_posted(void) * adjustment, they can be converted into kernel config parameters, though * making the state machine smarter might be a better option. */ -#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */ -#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. 
*/ #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -static int rcu_idle_flushes = RCU_IDLE_FLUSHES; -module_param(rcu_idle_flushes, int, 0644); -static int rcu_idle_opt_flushes = RCU_IDLE_OPT_FLUSHES; -module_param(rcu_idle_opt_flushes, int, 0644); static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; @@ -1629,178 +1606,97 @@ module_param(rcu_idle_lazy_gp_delay, int, 0644); extern int tick_nohz_enabled; /* - * Does the specified flavor of RCU have non-lazy callbacks pending on - * the specified CPU? Both RCU flavor and CPU are specified by the - * rcu_data structure. - */ -static bool __rcu_cpu_has_nonlazy_callbacks(struct rcu_data *rdp) -{ - return rdp->qlen != rdp->qlen_lazy; -} - -#ifdef CONFIG_TREE_PREEMPT_RCU - -/* - * Are there non-lazy RCU-preempt callbacks? (There cannot be if there - * is no RCU-preempt in the kernel.) + * Try to advance callbacks for all flavors of RCU on the current CPU. + * Afterwards, if there are any callbacks ready for immediate invocation, + * return true. */ -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) +static bool rcu_try_advance_all_cbs(void) { - struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); - - return __rcu_cpu_has_nonlazy_callbacks(rdp); -} - -#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + bool cbs_ready = false; + struct rcu_data *rdp; + struct rcu_node *rnp; + struct rcu_state *rsp; -static bool rcu_preempt_cpu_has_nonlazy_callbacks(int cpu) -{ - return 0; -} + for_each_rcu_flavor(rsp) { + rdp = this_cpu_ptr(rsp->rda); + rnp = rdp->mynode; -#endif /* else #ifdef CONFIG_TREE_PREEMPT_RCU */ + /* + * Don't bother checking unless a grace period has + * completed since we last checked and there are + * callbacks not yet ready to invoke. + */ + if (rdp->completed != rnp->completed && + rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_process_gp_end(rsp, rdp); -/* - * Does any flavor of RCU have non-lazy callbacks on the specified CPU? - */ -static bool rcu_cpu_has_nonlazy_callbacks(int cpu) -{ - return __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_sched_data, cpu)) || - __rcu_cpu_has_nonlazy_callbacks(&per_cpu(rcu_bh_data, cpu)) || - rcu_preempt_cpu_has_nonlazy_callbacks(cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + cbs_ready = true; + } + return cbs_ready; } /* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! + * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready + * to invoke. If the CPU has callbacks, try to advance them. Tell the + * caller to set the timeout based on whether or not there are non-lazy + * callbacks. * - * The delta_jiffies argument is used to store the time when RCU is - * going to need the CPU again if it still has callbacks. The reason - * for this is that rcu_prepare_for_idle() might need to post a timer, - * but if so, it will do so after tick_nohz_stop_sched_tick() has set - * the wakeup time for this CPU. 
This means that RCU's timer can be - * delayed until the wakeup time, which defeats the purpose of posting - * a timer. + * The caller must have disabled interrupts. */ -int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +int rcu_needs_cpu(int cpu, unsigned long *dj) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; + /* Snapshot to detect later posting of non-lazy callback. */ + rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; + /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - *delta_jiffies = ULONG_MAX; + if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { + *dj = ULONG_MAX; return 0; } - if (rdtp->dyntick_holdoff == jiffies) { - /* RCU recently tried and failed, so don't try again. */ - *delta_jiffies = 1; + + /* Attempt to advance callbacks. */ + if (rcu_try_advance_all_cbs()) { + /* Some ready to invoke, so initiate later invocation. */ + invoke_rcu_core(); return 1; } - /* Set up for the possibility that RCU will post a timer. */ - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - *delta_jiffies = round_up(rcu_idle_gp_delay + jiffies, - rcu_idle_gp_delay) - jiffies; + rdtp->last_accelerate = jiffies; + + /* Request timer delay depending on laziness, and round. */ + if (rdtp->all_lazy) { + *dj = round_up(rcu_idle_gp_delay + jiffies, + rcu_idle_gp_delay) - jiffies; } else { - *delta_jiffies = jiffies + rcu_idle_lazy_gp_delay; - *delta_jiffies = round_jiffies(*delta_jiffies) - jiffies; + *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } return 0; } /* - * Handler for smp_call_function_single(). The only point of this - * handler is to wake the CPU up, so the handler does only tracing. - */ -void rcu_idle_demigrate(void *unused) -{ - trace_rcu_prep_idle("Demigrate"); -} - -/* - * Timer handler used to force CPU to start pushing its remaining RCU - * callbacks in the case where it entered dyntick-idle mode with callbacks - * pending. The hander doesn't really need to do anything because the - * real work is done upon re-entry to idle, or by the next scheduling-clock - * interrupt should idle not be re-entered. - * - * One special case: the timer gets migrated without awakening the CPU - * on which the timer was scheduled on. In this case, we must wake up - * that CPU. We do so with smp_call_function_single(). - */ -static void rcu_idle_gp_timer_func(unsigned long cpu_in) -{ - int cpu = (int)cpu_in; - - trace_rcu_prep_idle("Timer"); - if (cpu != smp_processor_id()) - smp_call_function_single(cpu, rcu_idle_demigrate, NULL, 0); - else - WARN_ON_ONCE(1); /* Getting here can hang the system... */ -} - -/* - * Initialize the timer used to pull CPUs out of dyntick-idle mode. - */ -static void rcu_prepare_for_idle_init(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - rdtp->dyntick_holdoff = jiffies - 1; - setup_timer(&rdtp->idle_gp_timer, rcu_idle_gp_timer_func, cpu); - rdtp->idle_gp_timer_expires = jiffies - 1; - rdtp->idle_first_pass = 1; -} - -/* - * Clean up for exit from idle. Because we are exiting from idle, there - * is no longer any point to ->idle_gp_timer, so cancel it. This will - * do nothing if this timer is not active, so just cancel it unconditionally. 
- */ -static void rcu_cleanup_after_idle(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - del_timer(&rdtp->idle_gp_timer); - trace_rcu_prep_idle("Cleanup after idle"); - rdtp->tick_nohz_enabled_snap = ACCESS_ONCE(tick_nohz_enabled); -} - -/* - * Check to see if any RCU-related work can be done by the current CPU, - * and if so, schedule a softirq to get it done. This function is part - * of the RCU implementation; it is -not- an exported member of the RCU API. - * - * The idea is for the current CPU to clear out all work required by the - * RCU core for the current grace period, so that this CPU can be permitted - * to enter dyntick-idle mode. In some cases, it will need to be awakened - * at the end of the grace period by whatever CPU ends the grace period. - * This allows CPUs to go dyntick-idle more quickly, and to reduce the - * number of wakeups by a modest integer factor. - * - * Because it is not legal to invoke rcu_process_callbacks() with irqs - * disabled, we do one pass of force_quiescent_state(), then do a - * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked - * later. The ->dyntick_drain field controls the sequencing. + * Prepare a CPU for idle from an RCU perspective. The first major task + * is to sense whether nohz mode has been enabled or disabled via sysfs. + * The second major task is to check to see if a non-lazy callback has + * arrived at a CPU that previously had only lazy callbacks. The third + * major task is to accelerate (that is, assign grace-period numbers to) + * any recently arrived callbacks. * * The caller must have disabled interrupts. */ static void rcu_prepare_for_idle(int cpu) { - struct timer_list *tp; + struct rcu_data *rdp; struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + struct rcu_node *rnp; + struct rcu_state *rsp; int tne; /* Handle nohz enablement switches conservatively. */ tne = ACCESS_ONCE(tick_nohz_enabled); if (tne != rdtp->tick_nohz_enabled_snap) { - if (rcu_cpu_has_callbacks(cpu)) + if (rcu_cpu_has_callbacks(cpu, NULL)) invoke_rcu_core(); /* force nohz to see update. */ rdtp->tick_nohz_enabled_snap = tne; return; @@ -1808,125 +1704,56 @@ static void rcu_prepare_for_idle(int cpu) if (!tne) return; - /* Adaptive-tick mode, where usermode execution is idle to RCU. */ - if (!is_idle_task(current)) { - rdtp->dyntick_holdoff = jiffies - 1; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("User dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + rcu_idle_gp_delay, - rcu_idle_gp_delay); - } else if (rcu_cpu_has_callbacks(cpu)) { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + rcu_idle_lazy_gp_delay); - trace_rcu_prep_idle("User dyntick with lazy callbacks"); - } else { - return; - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); + /* If this is a no-CBs CPU, no callbacks, just return. */ + if (is_nocb_cpu(cpu)) return; - } /* - * If this is an idle re-entry, for example, due to use of - * RCU_NONIDLE() or the new idle-loop tracing API within the idle - * loop, then don't take any state-machine actions, unless the - * momentary exit from idle queued additional non-lazy callbacks. - * Instead, repost the ->idle_gp_timer if this CPU has callbacks - * pending. + * If a non-lazy callback arrived at a CPU having only lazy + * callbacks, invoke RCU core for the side-effect of recalculating + * idle duration on re-entry to idle. 
*/ - if (!rdtp->idle_first_pass && - (rdtp->nonlazy_posted == rdtp->nonlazy_posted_snap)) { - if (rcu_cpu_has_callbacks(cpu)) { - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - } + if (rdtp->all_lazy && + rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { + invoke_rcu_core(); return; } - rdtp->idle_first_pass = 0; - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted - 1; /* - * If there are no callbacks on this CPU, enter dyntick-idle mode. - * Also reset state to avoid prejudicing later attempts. + * If we have not yet accelerated this jiffy, accelerate all + * callbacks on this CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) { - rdtp->dyntick_holdoff = jiffies - 1; - rdtp->dyntick_drain = 0; - trace_rcu_prep_idle("No callbacks"); + if (rdtp->last_accelerate == jiffies) return; + rdtp->last_accelerate = jiffies; + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (!*rdp->nxttail[RCU_DONE_TAIL]) + continue; + rnp = rdp->mynode; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_accelerate_cbs(rsp, rnp, rdp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } +} - /* - * If in holdoff mode, just return. We will presumably have - * refrained from disabling the scheduling-clock tick. - */ - if (rdtp->dyntick_holdoff == jiffies) { - trace_rcu_prep_idle("In holdoff"); - return; - } +/* + * Clean up for exit from idle. Attempt to advance callbacks based on + * any grace periods that elapsed while the CPU was idle, and if any + * callbacks are now ready to invoke, initiate invocation. + */ +static void rcu_cleanup_after_idle(int cpu) +{ + struct rcu_data *rdp; + struct rcu_state *rsp; - /* Check and update the ->dyntick_drain sequencing. */ - if (rdtp->dyntick_drain <= 0) { - /* First time through, initialize the counter. */ - rdtp->dyntick_drain = rcu_idle_flushes; - } else if (rdtp->dyntick_drain <= rcu_idle_opt_flushes && - !rcu_pending(cpu) && - !local_softirq_pending()) { - /* Can we go dyntick-idle despite still having callbacks? */ - rdtp->dyntick_drain = 0; - rdtp->dyntick_holdoff = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) { - trace_rcu_prep_idle("Dyntick with callbacks"); - rdtp->idle_gp_timer_expires = - round_up(jiffies + rcu_idle_gp_delay, - rcu_idle_gp_delay); - } else { - rdtp->idle_gp_timer_expires = - round_jiffies(jiffies + rcu_idle_lazy_gp_delay); - trace_rcu_prep_idle("Dyntick with lazy callbacks"); - } - tp = &rdtp->idle_gp_timer; - mod_timer_pinned(tp, rdtp->idle_gp_timer_expires); - rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; - return; /* Nothing more to do immediately. */ - } else if (--(rdtp->dyntick_drain) <= 0) { - /* We have hit the limit, so time to give up. */ - rdtp->dyntick_holdoff = jiffies; - trace_rcu_prep_idle("Begin holdoff"); - invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */ + if (is_nocb_cpu(cpu)) return; - } - - /* - * Do one step of pushing the remaining RCU callbacks through - * the RCU core state machine. - */ -#ifdef CONFIG_TREE_PREEMPT_RCU - if (per_cpu(rcu_preempt_data, cpu).nxtlist) { - rcu_preempt_qs(cpu); - force_quiescent_state(&rcu_preempt_state); - } -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - if (per_cpu(rcu_sched_data, cpu).nxtlist) { - rcu_sched_qs(cpu); - force_quiescent_state(&rcu_sched_state); - } - if (per_cpu(rcu_bh_data, cpu).nxtlist) { - rcu_bh_qs(cpu); - force_quiescent_state(&rcu_bh_state); - } - - /* - * If RCU callbacks are still pending, RCU still needs this CPU. - * So try forcing the callbacks through the grace period. 
- */ - if (rcu_cpu_has_callbacks(cpu)) { - trace_rcu_prep_idle("More callbacks"); - invoke_rcu_core(); - } else { - trace_rcu_prep_idle("Callbacks drained"); + rcu_try_advance_all_cbs(); + for_each_rcu_flavor(rsp) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); } } @@ -2034,16 +1861,13 @@ early_initcall(rcu_register_oom_notifier); static void print_cpu_stall_fast_no_hz(char *cp, int cpu) { struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - struct timer_list *tltp = &rdtp->idle_gp_timer; - char c; + unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; - c = rdtp->dyntick_holdoff == jiffies ? 'H' : '.'; - if (timer_pending(tltp)) - sprintf(cp, "drain=%d %c timer=%lu", - rdtp->dyntick_drain, c, tltp->expires - jiffies); - else - sprintf(cp, "drain=%d %c timer not pending", - rdtp->dyntick_drain, c); + sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", + rdtp->last_accelerate & 0xffff, jiffies & 0xffff, + ulong2long(nlpd), + rdtp->all_lazy ? 'L' : '.', + rdtp->tick_nohz_enabled_snap ? '.' : 'D'); } #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 0d095dcaa670..49099e81c87b 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -46,8 +46,6 @@ #define RCU_TREE_NONCORE #include "rcutree.h" -#define ulong2long(a) (*(long *)(&(a))) - static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) { -- cgit From f607e31ce3963327f749b56c65dfec2642aa623c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 22 Feb 2013 18:36:53 +0000 Subject: ASoC: arizona: Fix interaction between headphone outputs and identification Running HPDET while the headphone outputs are enabled can disrupt the operation of HPDET. In order to avoid this, HPDET needs to disable the headphone outputs, and ASoC must not enable them while HPDET is running. Do the ASoC side of this by storing the enable state in the core driver structure and only writing it to the device when a flag indicates that the accessory-detection side is in a state where the headphone output stage may be enabled.
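The accessory-detection (HPDET) side is not part of this diff; a rough sketch of how it is expected to use the two new fields might look like the following (function names are hypothetical, the ARIZONA_OUT1L_ENA/ARIZONA_OUT1R_ENA masks are assumed to be the counterparts of the _SHIFT constants used in the codec drivers below, and the locking the real code needs is omitted):

	/* Hypothetical sketch: force the HP outputs off around HPDET, then
	 * replay the enable state that ASoC cached in arizona->hp_ena. */
	static void arizona_hpdet_start(struct arizona *arizona)
	{
		arizona->hpdet_magic = true;	/* arizona_hp_ev() now writes 0 */
		regmap_update_bits(arizona->regmap, ARIZONA_OUTPUT_ENABLES_1,
				   ARIZONA_OUT1L_ENA | ARIZONA_OUT1R_ENA, 0);
	}

	static void arizona_hpdet_stop(struct arizona *arizona)
	{
		arizona->hpdet_magic = false;
		regmap_update_bits(arizona->regmap, ARIZONA_OUTPUT_ENABLES_1,
				   ARIZONA_OUT1L_ENA | ARIZONA_OUT1R_ENA,
				   arizona->hp_ena);	/* restore cached state */
	}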
Signed-off-by: Mark Brown --- include/linux/mfd/arizona/core.h | 3 +++ sound/soc/codecs/arizona.c | 33 +++++++++++++++++++++++++++++++++ sound/soc/codecs/arizona.h | 3 +++ sound/soc/codecs/wm5102.c | 8 ++++---- sound/soc/codecs/wm5110.c | 8 ++++---- 5 files changed, 47 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mfd/arizona/core.h b/include/linux/mfd/arizona/core.h index a710255528d7..cc281368dc55 100644 --- a/include/linux/mfd/arizona/core.h +++ b/include/linux/mfd/arizona/core.h @@ -100,6 +100,9 @@ struct arizona { struct regmap_irq_chip_data *aod_irq_chip; struct regmap_irq_chip_data *irq_chip; + bool hpdet_magic; + unsigned int hp_ena; + struct mutex clk_lock; int clk32k_ref; diff --git a/sound/soc/codecs/arizona.c b/sound/soc/codecs/arizona.c index ac948a671ea6..e7d34711412c 100644 --- a/sound/soc/codecs/arizona.c +++ b/sound/soc/codecs/arizona.c @@ -364,6 +364,39 @@ int arizona_out_ev(struct snd_soc_dapm_widget *w, } EXPORT_SYMBOL_GPL(arizona_out_ev); +int arizona_hp_ev(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, + int event) +{ + struct arizona_priv *priv = snd_soc_codec_get_drvdata(w->codec); + unsigned int mask = 1 << w->shift; + unsigned int val; + + switch (event) { + case SND_SOC_DAPM_POST_PMU: + val = mask; + break; + case SND_SOC_DAPM_PRE_PMD: + val = 0; + break; + default: + return -EINVAL; + } + + /* Store the desired state for the HP outputs */ + priv->arizona->hp_ena &= ~mask; + priv->arizona->hp_ena |= val; + + /* Force off if HPDET magic is active */ + if (priv->arizona->hpdet_magic) + val = 0; + + snd_soc_update_bits(w->codec, ARIZONA_OUTPUT_ENABLES_1, mask, val); + + return arizona_out_ev(w, kcontrol, event); +} +EXPORT_SYMBOL_GPL(arizona_hp_ev); + static unsigned int arizona_sysclk_48k_rates[] = { 6144000, 12288000, diff --git a/sound/soc/codecs/arizona.h b/sound/soc/codecs/arizona.h index 116372c91f5d..13dd2916b721 100644 --- a/sound/soc/codecs/arizona.h +++ b/sound/soc/codecs/arizona.h @@ -184,6 +184,9 @@ extern int arizona_in_ev(struct snd_soc_dapm_widget *w, extern int arizona_out_ev(struct snd_soc_dapm_widget *w, struct snd_kcontrol *kcontrol, int event); +extern int arizona_hp_ev(struct snd_soc_dapm_widget *w, + struct snd_kcontrol *kcontrol, + int event); extern int arizona_set_sysclk(struct snd_soc_codec *codec, int clk_id, int source, unsigned int freq, int dir); diff --git a/sound/soc/codecs/wm5102.c b/sound/soc/codecs/wm5102.c index b82bbf584146..2657aad3f8b1 100644 --- a/sound/soc/codecs/wm5102.c +++ b/sound/soc/codecs/wm5102.c @@ -1131,11 +1131,11 @@ ARIZONA_DSP_WIDGETS(DSP1, "DSP1"), SND_SOC_DAPM_VALUE_MUX("AEC Loopback", ARIZONA_DAC_AEC_CONTROL_1, ARIZONA_AEC_LOOPBACK_ENA, 0, &wm5102_aec_loopback_mux), -SND_SOC_DAPM_PGA_E("OUT1L", ARIZONA_OUTPUT_ENABLES_1, - ARIZONA_OUT1L_ENA_SHIFT, 0, NULL, 0, arizona_out_ev, +SND_SOC_DAPM_PGA_E("OUT1L", SND_SOC_NOPM, + ARIZONA_OUT1L_ENA_SHIFT, 0, NULL, 0, arizona_hp_ev, SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMU), -SND_SOC_DAPM_PGA_E("OUT1R", ARIZONA_OUTPUT_ENABLES_1, - ARIZONA_OUT1R_ENA_SHIFT, 0, NULL, 0, arizona_out_ev, +SND_SOC_DAPM_PGA_E("OUT1R", SND_SOC_NOPM, + ARIZONA_OUT1R_ENA_SHIFT, 0, NULL, 0, arizona_hp_ev, SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_PGA_E("OUT2L", ARIZONA_OUTPUT_ENABLES_1, ARIZONA_OUT2L_ENA_SHIFT, 0, NULL, 0, arizona_out_ev, diff --git a/sound/soc/codecs/wm5110.c b/sound/soc/codecs/wm5110.c index cdeb301da1f6..7841b42a819c 100644 --- a/sound/soc/codecs/wm5110.c +++ b/sound/soc/codecs/wm5110.c @@ -551,11 +551,11 
@@ SND_SOC_DAPM_AIF_IN("AIF3RX1", NULL, 0, SND_SOC_DAPM_AIF_IN("AIF3RX2", NULL, 0, ARIZONA_AIF3_RX_ENABLES, ARIZONA_AIF3RX2_ENA_SHIFT, 0), -SND_SOC_DAPM_PGA_E("OUT1L", ARIZONA_OUTPUT_ENABLES_1, - ARIZONA_OUT1L_ENA_SHIFT, 0, NULL, 0, arizona_out_ev, +SND_SOC_DAPM_PGA_E("OUT1L", SND_SOC_NOPM, + ARIZONA_OUT1L_ENA_SHIFT, 0, NULL, 0, arizona_hp_ev, SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMU), -SND_SOC_DAPM_PGA_E("OUT1R", ARIZONA_OUTPUT_ENABLES_1, - ARIZONA_OUT1R_ENA_SHIFT, 0, NULL, 0, arizona_out_ev, +SND_SOC_DAPM_PGA_E("OUT1R", SND_SOC_NOPM, + ARIZONA_OUT1R_ENA_SHIFT, 0, NULL, 0, arizona_hp_ev, SND_SOC_DAPM_PRE_PMD | SND_SOC_DAPM_POST_PMU), SND_SOC_DAPM_PGA_E("OUT2L", ARIZONA_OUTPUT_ENABLES_1, ARIZONA_OUT2L_ENA_SHIFT, 0, NULL, 0, arizona_out_ev, -- cgit From de88cbb7b244f3bcd61d49fd6dec35c19192545a Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 18 Jan 2013 15:31:37 +0000 Subject: arm: Move chained_irq_(enter|exit) to a generic file These functions have been introduced by commit 10a8c383 (irq: introduce entry and exit functions for chained handlers) in asm/mach/irq.h. This patch moves them to linux/irqchip/chained_irq.h so that generic irqchip drivers do not rely on architecture specific header files. Signed-off-by: Catalin Marinas Tested-by: Marc Zyngier Cc: Russell King Cc: Thomas Gleixner Cc: Rob Herring --- arch/arm/include/asm/mach/irq.h | 31 ----------------- arch/arm/mach-at91/gpio.c | 3 +- arch/arm/mach-exynos/common.c | 1 + arch/arm/mach-s3c24xx/irq.c | 1 + arch/arm/plat-samsung/irq-vic-timer.c | 3 +- arch/arm/plat-samsung/s5p-irq-gpioint.c | 3 +- drivers/gpio/gpio-msm-v2.c | 3 +- drivers/gpio/gpio-mxc.c | 2 +- drivers/gpio/gpio-omap.c | 3 +- drivers/gpio/gpio-pl061.c | 2 +- drivers/gpio/gpio-pxa.c | 3 +- drivers/gpio/gpio-tegra.c | 3 +- drivers/irqchip/exynos-combiner.c | 1 + drivers/irqchip/irq-gic.c | 1 + drivers/pinctrl/pinctrl-at91.c | 3 +- drivers/pinctrl/pinctrl-exynos.c | 3 +- drivers/pinctrl/pinctrl-nomadik.c | 2 +- drivers/pinctrl/pinctrl-sirf.c | 2 +- drivers/pinctrl/spear/pinctrl-plgpio.c | 2 +- drivers/staging/imx-drm/ipu-v3/ipu-common.c | 2 +- include/linux/irqchip/chained_irq.h | 52 +++++++++++++++++++++++++++++ 21 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 include/linux/irqchip/chained_irq.h (limited to 'include/linux') diff --git a/arch/arm/include/asm/mach/irq.h b/arch/arm/include/asm/mach/irq.h index 749d5052fbb7..2092ee1e1300 100644 --- a/arch/arm/include/asm/mach/irq.h +++ b/arch/arm/include/asm/mach/irq.h @@ -30,35 +30,4 @@ do { \ raw_spin_unlock(&desc->lock); \ } while(0) -#ifndef __ASSEMBLY__ -/* - * Entry/exit functions for chained handlers where the primary IRQ chip - * may implement either fasteoi or level-trigger flow control. - */ -static inline void chained_irq_enter(struct irq_chip *chip, - struct irq_desc *desc) -{ - /* FastEOI controllers require no action on entry. 
*/ - if (chip->irq_eoi) - return; - - if (chip->irq_mask_ack) { - chip->irq_mask_ack(&desc->irq_data); - } else { - chip->irq_mask(&desc->irq_data); - if (chip->irq_ack) - chip->irq_ack(&desc->irq_data); - } -} - -static inline void chained_irq_exit(struct irq_chip *chip, - struct irq_desc *desc) -{ - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - else - chip->irq_unmask(&desc->irq_data); -} -#endif - #endif diff --git a/arch/arm/mach-at91/gpio.c b/arch/arm/mach-at91/gpio.c index c5d7e1e9d757..a5afcf76550e 100644 --- a/arch/arm/mach-at91/gpio.c +++ b/arch/arm/mach-at91/gpio.c @@ -22,10 +22,9 @@ #include #include #include +#include #include -#include - #include #include diff --git a/arch/arm/mach-exynos/common.c b/arch/arm/mach-exynos/common.c index d63d399c7bae..7bc0f9aa8b33 100644 --- a/arch/arm/mach-exynos/common.c +++ b/arch/arm/mach-exynos/common.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/mach-s3c24xx/irq.c b/arch/arm/mach-s3c24xx/irq.c index cb9f5e011e73..b6fac28a0034 100644 --- a/arch/arm/mach-s3c24xx/irq.c +++ b/arch/arm/mach-s3c24xx/irq.c @@ -25,6 +25,7 @@ #include #include #include +#include #include diff --git a/arch/arm/plat-samsung/irq-vic-timer.c b/arch/arm/plat-samsung/irq-vic-timer.c index f980cf3d2baa..5d205e74e495 100644 --- a/arch/arm/plat-samsung/irq-vic-timer.c +++ b/arch/arm/plat-samsung/irq-vic-timer.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -23,8 +24,6 @@ #include #include -#include - static void s3c_irq_demux_vic_timer(unsigned int irq, struct irq_desc *desc) { struct irq_chip *chip = irq_get_chip(irq); diff --git a/arch/arm/plat-samsung/s5p-irq-gpioint.c b/arch/arm/plat-samsung/s5p-irq-gpioint.c index bae56131a50a..fafdb059043a 100644 --- a/arch/arm/plat-samsung/s5p-irq-gpioint.c +++ b/arch/arm/plat-samsung/s5p-irq-gpioint.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,8 +23,6 @@ #include #include -#include - #define GPIO_BASE(chip) ((void __iomem *)((unsigned long)((chip)->base) & 0xFFFFF000u)) #define CON_OFFSET 0x700 diff --git a/drivers/gpio/gpio-msm-v2.c b/drivers/gpio/gpio-msm-v2.c index 55a7e7769af6..dd2eddeb1e0c 100644 --- a/drivers/gpio/gpio-msm-v2.c +++ b/drivers/gpio/gpio-msm-v2.c @@ -23,13 +23,12 @@ #include #include #include +#include #include #include #include #include -#include - #include #include diff --git a/drivers/gpio/gpio-mxc.c b/drivers/gpio/gpio-mxc.c index 7877335c4cc8..7176743915d3 100644 --- a/drivers/gpio/gpio-mxc.c +++ b/drivers/gpio/gpio-mxc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,6 @@ #include #include #include -#include enum mxc_gpio_hwtype { IMX1_GPIO, /* runs on i.mx1 */ diff --git a/drivers/gpio/gpio-omap.c b/drivers/gpio/gpio-omap.c index 159f5c57eb45..a612ea1c53cb 100644 --- a/drivers/gpio/gpio-omap.c +++ b/drivers/gpio/gpio-omap.c @@ -25,11 +25,10 @@ #include #include #include +#include #include #include -#include - #define OFF_MODE 1 static LIST_HEAD(omap_gpio_list); diff --git a/drivers/gpio/gpio-pl061.c b/drivers/gpio/gpio-pl061.c index b820869ca93c..29763361d13c 100644 --- a/drivers/gpio/gpio-pl061.c +++ b/drivers/gpio/gpio-pl061.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #define GPIODIR 0x400 #define GPIOIS 0x404 diff --git a/drivers/gpio/gpio-pxa.c b/drivers/gpio/gpio-pxa.c index 9cc108d2b770..7523b6d108d0 100644 --- 
a/drivers/gpio/gpio-pxa.c +++ b/drivers/gpio/gpio-pxa.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -26,8 +27,6 @@ #include #include -#include - #include /* diff --git a/drivers/gpio/gpio-tegra.c b/drivers/gpio/gpio-tegra.c index 414ad912232f..8e2155548888 100644 --- a/drivers/gpio/gpio-tegra.c +++ b/drivers/gpio/gpio-tegra.c @@ -27,11 +27,10 @@ #include #include #include +#include #include #include -#include - #define GPIO_BANK(x) ((x) >> 5) #define GPIO_PORT(x) (((x) >> 3) & 0x3) #define GPIO_BIT(x) ((x) & 0x7) diff --git a/drivers/irqchip/exynos-combiner.c b/drivers/irqchip/exynos-combiner.c index 04d86a9803f4..6a5201351507 100644 --- a/drivers/irqchip/exynos-combiner.c +++ b/drivers/irqchip/exynos-combiner.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index a32e0d5aa45f..0b1c0af646de 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include diff --git a/drivers/pinctrl/pinctrl-at91.c b/drivers/pinctrl/pinctrl-at91.c index 75933a6aa828..5cbadc9ad2e8 100644 --- a/drivers/pinctrl/pinctrl-at91.c +++ b/drivers/pinctrl/pinctrl-at91.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -27,8 +28,6 @@ /* Since we request GPIOs from ourself */ #include -#include - #include #include diff --git a/drivers/pinctrl/pinctrl-exynos.c b/drivers/pinctrl/pinctrl-exynos.c index 538b9ddaadf7..7265e551dddb 100644 --- a/drivers/pinctrl/pinctrl-exynos.c +++ b/drivers/pinctrl/pinctrl-exynos.c @@ -23,13 +23,12 @@ #include #include #include +#include #include #include #include #include -#include - #include "pinctrl-samsung.h" #include "pinctrl-exynos.h" diff --git a/drivers/pinctrl/pinctrl-nomadik.c b/drivers/pinctrl/pinctrl-nomadik.c index 36d20293de5c..93eba9715e62 100644 --- a/drivers/pinctrl/pinctrl-nomadik.c +++ b/drivers/pinctrl/pinctrl-nomadik.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -33,7 +34,6 @@ /* Since we request GPIOs from ourself */ #include #include -#include #include "pinctrl-nomadik.h" #include "core.h" diff --git a/drivers/pinctrl/pinctrl-sirf.c b/drivers/pinctrl/pinctrl-sirf.c index d02498b30c6e..ab26b4b669d5 100644 --- a/drivers/pinctrl/pinctrl-sirf.c +++ b/drivers/pinctrl/pinctrl-sirf.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include #include -#include #define DRIVER_NAME "pinmux-sirf" diff --git a/drivers/pinctrl/spear/pinctrl-plgpio.c b/drivers/pinctrl/spear/pinctrl-plgpio.c index 295b349a05cf..a4908ecd74fb 100644 --- a/drivers/pinctrl/spear/pinctrl-plgpio.c +++ b/drivers/pinctrl/spear/pinctrl-plgpio.c @@ -15,12 +15,12 @@ #include #include #include +#include #include #include #include #include #include -#include #define MAX_GPIO_PER_REG 32 #define PIN_OFFSET(pin) (pin % MAX_GPIO_PER_REG) diff --git a/drivers/staging/imx-drm/ipu-v3/ipu-common.c b/drivers/staging/imx-drm/ipu-v3/ipu-common.c index 366f259e3756..6efe4e1b499f 100644 --- a/drivers/staging/imx-drm/ipu-v3/ipu-common.c +++ b/drivers/staging/imx-drm/ipu-v3/ipu-common.c @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include #include "imx-ipu-v3.h" #include "ipu-prv.h" diff --git a/include/linux/irqchip/chained_irq.h b/include/linux/irqchip/chained_irq.h new file mode 100644 index 000000000000..adf4c30f3af6 --- /dev/null +++ 
b/include/linux/irqchip/chained_irq.h @@ -0,0 +1,52 @@ +/* + * Chained IRQ handlers support. + * + * Copyright (C) 2011 ARM Ltd. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ +#ifndef __IRQCHIP_CHAINED_IRQ_H +#define __IRQCHIP_CHAINED_IRQ_H + +#include + +/* + * Entry/exit functions for chained handlers where the primary IRQ chip + * may implement either fasteoi or level-trigger flow control. + */ +static inline void chained_irq_enter(struct irq_chip *chip, + struct irq_desc *desc) +{ + /* FastEOI controllers require no action on entry. */ + if (chip->irq_eoi) + return; + + if (chip->irq_mask_ack) { + chip->irq_mask_ack(&desc->irq_data); + } else { + chip->irq_mask(&desc->irq_data); + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); + } +} + +static inline void chained_irq_exit(struct irq_chip *chip, + struct irq_desc *desc) +{ + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); + else + chip->irq_unmask(&desc->irq_data); +} + +#endif /* __IRQCHIP_CHAINED_IRQ_H */ -- cgit From c0114709ed85a5693eb74acdfa03d94f7f12e5b8 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 14 Jan 2013 18:05:37 +0000 Subject: irqchip: gic: Perform the gic_secondary_init() call via CPU notifier All the calls to gic_secondary_init() pass 0 as the first argument. Since this function is called on each CPU when starting, it can be done in a platform-independent way via a CPU notifier registered by the GIC code. 
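In the pre-hotplug-state-machine kernels of this era, that notifier pattern looks roughly like the sketch below (generic names; the GIC's actual notifier appears in the diff further down). CPU_STARTING is delivered on the incoming CPU itself, early in bringup with interrupts still disabled, which is exactly when per-CPU interrupt-controller state must be programmed:

	#include <linux/cpu.h>
	#include <linux/notifier.h>

	static int __cpuinit my_starting_notifier(struct notifier_block *nfb,
						  unsigned long action, void *hcpu)
	{
		/* Mask off CPU_TASKS_FROZEN so suspend/resume is covered too. */
		if ((action & ~CPU_TASKS_FROZEN) == CPU_STARTING)
			my_percpu_irqchip_init();	/* hypothetical per-CPU hook */
		return NOTIFY_OK;
	}

	static struct notifier_block __cpuinitdata my_cpu_notifier = {
		.notifier_call	= my_starting_notifier,
		.priority	= 100,	/* run ahead of lower-priority notifiers */
	};

	static int __init my_driver_init(void)
	{
		register_cpu_notifier(&my_cpu_notifier);
		return 0;
	}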
Signed-off-by: Catalin Marinas Acked-by: Stephen Warren Acked-by: Viresh Kumar Acked-by: Santosh Shilimkar Acked-by: Rob Herring Acked-by: Simon Horman Tested-by: Simon Horman Acked-by: Srinidhi Kasagar Tested-by: Dinh Nguyen Acked-by: Nicolas Pitre Tested-by: Marc Zyngier Cc: Russell King Cc: Thomas Gleixner Cc: Kukjin Kim Cc: Sascha Hauer Cc: David Brown Cc: Bryan Huntsman Cc: Tony Lindgren Cc: Magnus Damm Cc: Shiraz Hashim Cc: Linus Walleij Cc: Will Deacon Cc: Kukjin Kim Cc: Barry Song --- arch/arm/mach-exynos/platsmp.c | 8 -------- arch/arm/mach-highbank/platsmp.c | 7 ------- arch/arm/mach-imx/platsmp.c | 12 ------------ arch/arm/mach-msm/platsmp.c | 8 -------- arch/arm/mach-omap2/omap-smp.c | 7 ------- arch/arm/mach-prima2/platsmp.c | 8 -------- arch/arm/mach-shmobile/smp-emev2.c | 7 ------- arch/arm/mach-shmobile/smp-r8a7779.c | 7 ------- arch/arm/mach-shmobile/smp-sh73a0.c | 7 ------- arch/arm/mach-socfpga/platsmp.c | 12 ------------ arch/arm/mach-spear13xx/platsmp.c | 8 -------- arch/arm/mach-tegra/platsmp.c | 8 -------- arch/arm/mach-ux500/platsmp.c | 8 -------- arch/arm/mach-virt/platsmp.c | 8 -------- arch/arm/plat-versatile/platsmp.c | 8 -------- drivers/irqchip/irq-gic.c | 28 +++++++++++++++++++++------- include/linux/irqchip/arm-gic.h | 1 - 17 files changed, 21 insertions(+), 131 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-exynos/platsmp.c b/arch/arm/mach-exynos/platsmp.c index 60f7c5be057d..95e04bd5813f 100644 --- a/arch/arm/mach-exynos/platsmp.c +++ b/arch/arm/mach-exynos/platsmp.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include @@ -75,13 +74,6 @@ static DEFINE_SPINLOCK(boot_lock); static void __cpuinit exynos_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * let the primary processor know we're out of the * pen, then head off into the C entry point diff --git a/arch/arm/mach-highbank/platsmp.c b/arch/arm/mach-highbank/platsmp.c index 8797a7001720..a984573e0d02 100644 --- a/arch/arm/mach-highbank/platsmp.c +++ b/arch/arm/mach-highbank/platsmp.c @@ -17,7 +17,6 @@ #include #include #include -#include #include @@ -25,11 +24,6 @@ extern void secondary_startup(void); -static void __cpuinit highbank_secondary_init(unsigned int cpu) -{ - gic_secondary_init(0); -} - static int __cpuinit highbank_boot_secondary(unsigned int cpu, struct task_struct *idle) { highbank_set_cpu_jump(cpu, secondary_startup); @@ -67,7 +61,6 @@ static void __init highbank_smp_prepare_cpus(unsigned int max_cpus) struct smp_operations highbank_smp_ops __initdata = { .smp_init_cpus = highbank_smp_init_cpus, .smp_prepare_cpus = highbank_smp_prepare_cpus, - .smp_secondary_init = highbank_secondary_init, .smp_boot_secondary = highbank_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_die = highbank_cpu_die, diff --git a/arch/arm/mach-imx/platsmp.c b/arch/arm/mach-imx/platsmp.c index 7c0b03f67b05..77e9a25ed0f6 100644 --- a/arch/arm/mach-imx/platsmp.c +++ b/arch/arm/mach-imx/platsmp.c @@ -12,7 +12,6 @@ #include #include -#include #include #include #include @@ -52,16 +51,6 @@ void imx_scu_standby_enable(void) writel_relaxed(val, scu_base); } -static void __cpuinit imx_secondary_init(unsigned int cpu) -{ - /* - * if any interrupts are already enabled for the primary - * core (e.g. 
timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); -} - static int __cpuinit imx_boot_secondary(unsigned int cpu, struct task_struct *idle) { imx_set_cpu_jump(cpu, v7_secondary_startup); @@ -96,7 +85,6 @@ static void __init imx_smp_prepare_cpus(unsigned int max_cpus) struct smp_operations imx_smp_ops __initdata = { .smp_init_cpus = imx_smp_init_cpus, .smp_prepare_cpus = imx_smp_prepare_cpus, - .smp_secondary_init = imx_secondary_init, .smp_boot_secondary = imx_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_die = imx_cpu_die, diff --git a/arch/arm/mach-msm/platsmp.c b/arch/arm/mach-msm/platsmp.c index 42932865416a..00cdb0a5dac8 100644 --- a/arch/arm/mach-msm/platsmp.c +++ b/arch/arm/mach-msm/platsmp.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -41,13 +40,6 @@ static inline int get_core_count(void) static void __cpuinit msm_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * let the primary processor know we're out of the * pen, then head off into the C entry point diff --git a/arch/arm/mach-omap2/omap-smp.c b/arch/arm/mach-omap2/omap-smp.c index d9727218dd0a..e7a449758ab5 100644 --- a/arch/arm/mach-omap2/omap-smp.c +++ b/arch/arm/mach-omap2/omap-smp.c @@ -66,13 +66,6 @@ static void __cpuinit omap4_secondary_init(unsigned int cpu) omap_secure_dispatcher(OMAP4_PPA_CPU_ACTRL_SMP_INDEX, 4, 0, 0, 0, 0, 0); - /* - * If any interrupts are already enabled for the primary - * core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * Synchronise with the boot thread. */ diff --git a/arch/arm/mach-prima2/platsmp.c b/arch/arm/mach-prima2/platsmp.c index 4b788310f6a6..c7c92e78f0cf 100644 --- a/arch/arm/mach-prima2/platsmp.c +++ b/arch/arm/mach-prima2/platsmp.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include @@ -48,13 +47,6 @@ void __init sirfsoc_map_scu(void) static void __cpuinit sirfsoc_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. 
timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * let the primary processor know we're out of the * pen, then head off into the C entry point diff --git a/arch/arm/mach-shmobile/smp-emev2.c b/arch/arm/mach-shmobile/smp-emev2.c index 953eb1f9388d..384e27dd3601 100644 --- a/arch/arm/mach-shmobile/smp-emev2.c +++ b/arch/arm/mach-shmobile/smp-emev2.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -85,11 +84,6 @@ static int __maybe_unused emev2_cpu_kill(unsigned int cpu) } -static void __cpuinit emev2_secondary_init(unsigned int cpu) -{ - gic_secondary_init(0); -} - static int __cpuinit emev2_boot_secondary(unsigned int cpu, struct task_struct *idle) { cpu = cpu_logical_map(cpu); @@ -124,7 +118,6 @@ static void __init emev2_smp_init_cpus(void) struct smp_operations emev2_smp_ops __initdata = { .smp_init_cpus = emev2_smp_init_cpus, .smp_prepare_cpus = emev2_smp_prepare_cpus, - .smp_secondary_init = emev2_secondary_init, .smp_boot_secondary = emev2_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_kill = emev2_cpu_kill, diff --git a/arch/arm/mach-shmobile/smp-r8a7779.c b/arch/arm/mach-shmobile/smp-r8a7779.c index 3a4acf23edcf..994906560edd 100644 --- a/arch/arm/mach-shmobile/smp-r8a7779.c +++ b/arch/arm/mach-shmobile/smp-r8a7779.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -132,11 +131,6 @@ static int __maybe_unused r8a7779_cpu_kill(unsigned int cpu) } -static void __cpuinit r8a7779_secondary_init(unsigned int cpu) -{ - gic_secondary_init(0); -} - static int __cpuinit r8a7779_boot_secondary(unsigned int cpu, struct task_struct *idle) { struct r8a7779_pm_ch *ch = NULL; @@ -186,7 +180,6 @@ static void __init r8a7779_smp_init_cpus(void) struct smp_operations r8a7779_smp_ops __initdata = { .smp_init_cpus = r8a7779_smp_init_cpus, .smp_prepare_cpus = r8a7779_smp_prepare_cpus, - .smp_secondary_init = r8a7779_secondary_init, .smp_boot_secondary = r8a7779_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_kill = r8a7779_cpu_kill, diff --git a/arch/arm/mach-shmobile/smp-sh73a0.c b/arch/arm/mach-shmobile/smp-sh73a0.c index acb46a94ccdf..d0f9aca22477 100644 --- a/arch/arm/mach-shmobile/smp-sh73a0.c +++ b/arch/arm/mach-shmobile/smp-sh73a0.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -59,11 +58,6 @@ static unsigned int __init sh73a0_get_core_count(void) return scu_get_core_count(scu_base); } -static void __cpuinit sh73a0_secondary_init(unsigned int cpu) -{ - gic_secondary_init(0); -} - static int __cpuinit sh73a0_boot_secondary(unsigned int cpu, struct task_struct *idle) { cpu = cpu_logical_map(cpu); @@ -138,7 +132,6 @@ static void sh73a0_cpu_die(unsigned int cpu) struct smp_operations sh73a0_smp_ops __initdata = { .smp_init_cpus = sh73a0_smp_init_cpus, .smp_prepare_cpus = sh73a0_smp_prepare_cpus, - .smp_secondary_init = sh73a0_secondary_init, .smp_boot_secondary = sh73a0_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_kill = sh73a0_cpu_kill, diff --git a/arch/arm/mach-socfpga/platsmp.c b/arch/arm/mach-socfpga/platsmp.c index 84c60fa8daa0..ca14d1d5ac7f 100644 --- a/arch/arm/mach-socfpga/platsmp.c +++ b/arch/arm/mach-socfpga/platsmp.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include @@ -33,16 +32,6 @@ extern void __iomem *sys_manager_base_addr; extern void __iomem *rst_manager_base_addr; -static void __cpuinit socfpga_secondary_init(unsigned int cpu) -{ - /* - * if any interrupts are already enabled for the primary - * 
core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); -} - static int __cpuinit socfpga_boot_secondary(unsigned int cpu, struct task_struct *idle) { int trampoline_size = &secondary_trampoline_end - &secondary_trampoline; @@ -109,7 +98,6 @@ static void socfpga_cpu_die(unsigned int cpu) struct smp_operations socfpga_smp_ops __initdata = { .smp_init_cpus = socfpga_smp_init_cpus, .smp_prepare_cpus = socfpga_smp_prepare_cpus, - .smp_secondary_init = socfpga_secondary_init, .smp_boot_secondary = socfpga_boot_secondary, #ifdef CONFIG_HOTPLUG_CPU .cpu_die = socfpga_cpu_die, diff --git a/arch/arm/mach-spear13xx/platsmp.c b/arch/arm/mach-spear13xx/platsmp.c index af4ade61cd95..551c69c9a228 100644 --- a/arch/arm/mach-spear13xx/platsmp.c +++ b/arch/arm/mach-spear13xx/platsmp.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -27,13 +26,6 @@ static void __iomem *scu_base = IOMEM(VA_SCU_BASE); static void __cpuinit spear13xx_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * let the primary processor know we're out of the * pen, then head off into the C entry point diff --git a/arch/arm/mach-tegra/platsmp.c b/arch/arm/mach-tegra/platsmp.c index 2c6b3d55213b..9348d3c496a9 100644 --- a/arch/arm/mach-tegra/platsmp.c +++ b/arch/arm/mach-tegra/platsmp.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include @@ -44,13 +43,6 @@ static cpumask_t tegra_cpu_init_mask; static void __cpuinit tegra_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - cpumask_set_cpu(cpu, &tegra_cpu_init_mask); } diff --git a/arch/arm/mach-ux500/platsmp.c b/arch/arm/mach-ux500/platsmp.c index 18f7af339dc9..152b1309b9af 100644 --- a/arch/arm/mach-ux500/platsmp.c +++ b/arch/arm/mach-ux500/platsmp.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -57,13 +56,6 @@ static DEFINE_SPINLOCK(boot_lock); static void __cpuinit ux500_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. 
timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * let the primary processor know we're out of the * pen, then head off into the C entry point diff --git a/arch/arm/mach-virt/platsmp.c b/arch/arm/mach-virt/platsmp.c index 8badaabe70a1..f4143f5bfa5b 100644 --- a/arch/arm/mach-virt/platsmp.c +++ b/arch/arm/mach-virt/platsmp.c @@ -21,8 +21,6 @@ #include #include -#include - #include #include @@ -45,14 +43,8 @@ static int __cpuinit virt_boot_secondary(unsigned int cpu, return -ENODEV; } -static void __cpuinit virt_secondary_init(unsigned int cpu) -{ - gic_secondary_init(0); -} - struct smp_operations __initdata virt_smp_ops = { .smp_init_cpus = virt_smp_init_cpus, .smp_prepare_cpus = virt_smp_prepare_cpus, - .smp_secondary_init = virt_secondary_init, .smp_boot_secondary = virt_boot_secondary, }; diff --git a/arch/arm/plat-versatile/platsmp.c b/arch/arm/plat-versatile/platsmp.c index f2ac15561778..1e1b2d769748 100644 --- a/arch/arm/plat-versatile/platsmp.c +++ b/arch/arm/plat-versatile/platsmp.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include @@ -36,13 +35,6 @@ static DEFINE_SPINLOCK(boot_lock); void __cpuinit versatile_secondary_init(unsigned int cpu) { - /* - * if any interrupts are already enabled for the primary - * core (e.g. timer irq), then they will not have been enabled - * for us: do so - */ - gic_secondary_init(0); - /* * let the primary processor know we're out of the * pen, then head off into the C entry point diff --git a/drivers/irqchip/irq-gic.c b/drivers/irqchip/irq-gic.c index 974f77c887b8..add1fd84fc4b 100644 --- a/drivers/irqchip/irq-gic.c +++ b/drivers/irqchip/irq-gic.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -699,6 +700,25 @@ static int gic_irq_domain_xlate(struct irq_domain *d, return 0; } +#ifdef CONFIG_SMP +static int __cpuinit gic_secondary_init(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + if (action == CPU_STARTING) + gic_cpu_init(&gic_data[0]); + return NOTIFY_OK; +} + +/* + * Notifier for enabling the GIC CPU interface. Set an arbitrarily high + * priority because the GIC needs to be up before the ARM generic timers. 
+ */ +static struct notifier_block __cpuinitdata gic_cpu_notifier = { + .notifier_call = gic_secondary_init, + .priority = 100, +}; +#endif + const struct irq_domain_ops gic_irq_domain_ops = { .map = gic_irq_domain_map, .xlate = gic_irq_domain_xlate, @@ -789,6 +809,7 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, #ifdef CONFIG_SMP set_smp_cross_call(gic_raise_softirq); + register_cpu_notifier(&gic_cpu_notifier); #endif set_handle_irq(gic_handle_irq); @@ -799,13 +820,6 @@ void __init gic_init_bases(unsigned int gic_nr, int irq_start, gic_pm_init(gic); } -void __cpuinit gic_secondary_init(unsigned int gic_nr) -{ - BUG_ON(gic_nr >= MAX_GIC_NR); - - gic_cpu_init(&gic_data[gic_nr]); -} - #ifdef CONFIG_OF static int gic_cnt __initdata = 0; diff --git a/include/linux/irqchip/arm-gic.h b/include/linux/irqchip/arm-gic.h index 3fd8e4290a1c..3e203eb23cc7 100644 --- a/include/linux/irqchip/arm-gic.h +++ b/include/linux/irqchip/arm-gic.h @@ -65,7 +65,6 @@ extern struct irq_chip gic_arch_extn; void gic_init_bases(unsigned int, int, void __iomem *, void __iomem *, u32 offset, struct device_node *); -void gic_secondary_init(unsigned int); void gic_cascade_irq(unsigned int gic_nr, unsigned int irq); static inline void gic_init(unsigned int nr, int start, -- cgit From 6752c8db8e0cfedb44ba62806dd15b383ed64000 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki / 吉藤英明 Date: Mon, 25 Mar 2013 08:26:16 +0000 Subject: firewire net, ipv4 arp: Extend hardware address and remove driver-level packet inspection. Inspection of the upper layer protocol is considered harmful, especially if it is ARP or another stateful upper layer protocol; the driver cannot (and should not) have full state of them. The IPv4 over FireWire module used to inspect ARP (both in the sending path and in the receiving path), and record the peer's GUID, max packet size, max speed and fifo address. This patch removes such inspection by extending our "hardware address" definition to include other information as well: max packet size, max speed and fifo. By doing this, the neighbour module in the networking subsystem can cache them. Note: As we have started ignoring sspd and max_rec in ARP/NDP, that information will not be used in the driver when sending. When a packet is being sent, the IP layer fills our pseudo header with the extended "hardware address", including GUID and fifo. The driver can look up the node-id (the real but rather volatile low-level address) by GUID, and then the module can send the packet to the wire using the parameters provided in the extended hardware address. This approach is realistic because IP over IEEE1394 (RFC2734) and IPv6 over IEEE1394 (RFC3146) share the same "hardware address" format in their address resolution protocols. Here, the extended "hardware address" is defined as follows: union fwnet_hwaddr { u8 u[16]; struct { __be64 uniq_id; /* EUI-64 */ u8 max_rec; /* max packet size */ u8 sspd; /* max speed */ __be16 fifo_hi; /* hi 16bits of FIFO addr */ __be32 fifo_lo; /* lo 32bits of FIFO addr */ } __packed uc; }; Note that the hardware address is declared as a union, so that we can map a full IP address into it when implementing MCAP (Multicast Channel Allocation Protocol) for IPv6, but the IP and ARP subsystems do not need to know this format in detail. One difference between the original ARP (RFC826) and 1394 ARP (RFC2734) is that 1394 ARP Request/Reply messages do not contain the target hardware address field (aka ar$tha). This difference is handled in the ARP subsystem. CC: Stephan Gatzka Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S.
Miller --- drivers/firewire/net.c | 153 +++++++++---------------------------------------- include/linux/if_arp.h | 12 +++- include/net/firewire.h | 25 ++++++++ net/ipv4/arp.c | 27 +++++++-- 4 files changed, 82 insertions(+), 135 deletions(-) create mode 100644 include/net/firewire.h (limited to 'include/linux') diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index cb8aa865ff88..790017eb5051 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -28,6 +28,7 @@ #include #include +#include /* rx limits */ #define FWNET_MAX_FRAGMENTS 30 /* arbitrary, > TX queue depth */ @@ -57,33 +58,6 @@ #define RFC2374_HDR_LASTFRAG 2 /* last fragment */ #define RFC2374_HDR_INTFRAG 3 /* interior fragment */ -#define RFC2734_HW_ADDR_LEN 16 - -struct rfc2734_arp { - __be16 hw_type; /* 0x0018 */ - __be16 proto_type; /* 0x0806 */ - u8 hw_addr_len; /* 16 */ - u8 ip_addr_len; /* 4 */ - __be16 opcode; /* ARP Opcode */ - /* Above is exactly the same format as struct arphdr */ - - __be64 s_uniq_id; /* Sender's 64bit EUI */ - u8 max_rec; /* Sender's max packet size */ - u8 sspd; /* Sender's max speed */ - __be16 fifo_hi; /* hi 16bits of sender's FIFO addr */ - __be32 fifo_lo; /* lo 32bits of sender's FIFO addr */ - __be32 sip; /* Sender's IP Address */ - __be32 tip; /* IP Address of requested hw addr */ -} __packed; - -/* This header format is specific to this driver implementation. */ -#define FWNET_ALEN 8 -#define FWNET_HLEN 10 -struct fwnet_header { - u8 h_dest[FWNET_ALEN]; /* destination address */ - __be16 h_proto; /* packet type ID field */ -} __packed; - static bool fwnet_hwaddr_is_multicast(u8 *ha) { return !!(*ha & 1); @@ -196,8 +170,6 @@ struct fwnet_peer { struct list_head peer_link; struct fwnet_device *dev; u64 guid; - u64 fifo; - __be32 ip; /* guarded by dev->lock */ struct list_head pd_list; /* received partial datagrams */ @@ -226,6 +198,15 @@ struct fwnet_packet_task { u8 enqueued; }; +/* + * Get fifo address embedded in hwaddr + */ +static __u64 fwnet_hwaddr_fifo(union fwnet_hwaddr *ha) +{ + return (u64)get_unaligned_be16(&ha->uc.fifo_hi) << 32 + | get_unaligned_be32(&ha->uc.fifo_lo); +} + /* * saddr == NULL means use device source address. * daddr == NULL means leave destination address (eg unresolved arp). @@ -518,7 +499,6 @@ static int fwnet_finish_incoming_packet(struct net_device *net, bool is_broadcast, u16 ether_type) { struct fwnet_device *dev; - static const __be64 broadcast_hw = cpu_to_be64(~0ULL); int status; __be64 guid; @@ -537,76 +517,11 @@ static int fwnet_finish_incoming_packet(struct net_device *net, /* * Parse the encapsulation header. This actually does the job of - * converting to an ethernet frame header, as well as arp - * conversion if needed. ARP conversion is easier in this - * direction, since we are using ethernet as our backend. - */ - /* - * If this is an ARP packet, convert it. First, we want to make - * use of some of the fields, since they tell us a little bit - * about the sending machine. + * converting to an ethernet-like pseudo frame header. 
*/ - if (ether_type == ETH_P_ARP) { - struct rfc2734_arp *arp1394; - struct arphdr *arp; - unsigned char *arp_ptr; - u64 fifo_addr; - u64 peer_guid; - struct fwnet_peer *peer; - unsigned long flags; - - arp1394 = (struct rfc2734_arp *)skb->data; - arp = (struct arphdr *)skb->data; - arp_ptr = (unsigned char *)(arp + 1); - peer_guid = get_unaligned_be64(&arp1394->s_uniq_id); - fifo_addr = (u64)get_unaligned_be16(&arp1394->fifo_hi) << 32 - | get_unaligned_be32(&arp1394->fifo_lo); - - spin_lock_irqsave(&dev->lock, flags); - peer = fwnet_peer_find_by_guid(dev, peer_guid); - if (peer) { - peer->fifo = fifo_addr; - peer->ip = arp1394->sip; - } - spin_unlock_irqrestore(&dev->lock, flags); - - if (!peer) { - dev_notice(&net->dev, - "no peer for ARP packet from %016llx\n", - (unsigned long long)peer_guid); - goto no_peer; - } - - /* - * Now that we're done with the 1394 specific stuff, we'll - * need to alter some of the data. Believe it or not, all - * that needs to be done is sender_IP_address needs to be - * moved, the destination hardware address get stuffed - * in and the hardware address length set to 8. - * - * IMPORTANT: The code below overwrites 1394 specific data - * needed above so keep the munging of the data for the - * higher level IP stack last. - */ - - arp->ar_hln = 8; - /* skip over sender unique id */ - arp_ptr += arp->ar_hln; - /* move sender IP addr */ - put_unaligned(arp1394->sip, (u32 *)arp_ptr); - /* skip over sender IP addr */ - arp_ptr += arp->ar_pln; - - if (arp->ar_op == htons(ARPOP_REQUEST)) - memset(arp_ptr, 0, sizeof(u64)); - else - memcpy(arp_ptr, net->dev_addr, sizeof(u64)); - } - - /* Now add the ethernet header. */ guid = cpu_to_be64(dev->card->guid); if (dev_hard_header(skb, net, ether_type, - is_broadcast ? &broadcast_hw : &guid, + is_broadcast ? 
net->broadcast : net->dev_addr, NULL, skb->len) >= 0) { struct fwnet_header *eth; u16 *rawp; @@ -649,7 +564,6 @@ static int fwnet_finish_incoming_packet(struct net_device *net, return 0; - no_peer: err: net->stats.rx_errors++; net->stats.rx_dropped++; @@ -1355,11 +1269,12 @@ static netdev_tx_t fwnet_tx(struct sk_buff *skb, struct net_device *net) ptask->dest_node = IEEE1394_ALL_NODES; ptask->speed = SCODE_100; } else { - __be64 guid = get_unaligned((__be64 *)hdr_buf.h_dest); + union fwnet_hwaddr *ha = (union fwnet_hwaddr *)hdr_buf.h_dest; + __be64 guid = get_unaligned(&ha->uc.uniq_id); u8 generation; peer = fwnet_peer_find_by_guid(dev, be64_to_cpu(guid)); - if (!peer || peer->fifo == FWNET_NO_FIFO_ADDR) + if (!peer) goto fail; generation = peer->generation; @@ -1367,32 +1282,12 @@ static netdev_tx_t fwnet_tx(struct sk_buff *skb, struct net_device *net) max_payload = peer->max_payload; datagram_label_ptr = &peer->datagram_label; - ptask->fifo_addr = peer->fifo; + ptask->fifo_addr = fwnet_hwaddr_fifo(ha); ptask->generation = generation; ptask->dest_node = dest_node; ptask->speed = peer->speed; } - /* If this is an ARP packet, convert it */ - if (proto == htons(ETH_P_ARP)) { - struct arphdr *arp = (struct arphdr *)skb->data; - unsigned char *arp_ptr = (unsigned char *)(arp + 1); - struct rfc2734_arp *arp1394 = (struct rfc2734_arp *)skb->data; - __be32 ipaddr; - - ipaddr = get_unaligned((__be32 *)(arp_ptr + FWNET_ALEN)); - - arp1394->hw_addr_len = RFC2734_HW_ADDR_LEN; - arp1394->max_rec = dev->card->max_receive; - arp1394->sspd = dev->card->link_speed; - - put_unaligned_be16(dev->local_fifo >> 32, - &arp1394->fifo_hi); - put_unaligned_be32(dev->local_fifo & 0xffffffff, - &arp1394->fifo_lo); - put_unaligned(ipaddr, &arp1394->sip); - } - ptask->hdr.w0 = 0; ptask->hdr.w1 = 0; ptask->skb = skb; @@ -1507,8 +1402,6 @@ static int fwnet_add_peer(struct fwnet_device *dev, peer->dev = dev; peer->guid = (u64)device->config_rom[3] << 32 | device->config_rom[4]; - peer->fifo = FWNET_NO_FIFO_ADDR; - peer->ip = 0; INIT_LIST_HEAD(&peer->pd_list); peer->pdg_size = 0; peer->datagram_label = 0; @@ -1538,6 +1431,7 @@ static int fwnet_probe(struct device *_dev) struct fwnet_device *dev; unsigned max_mtu; int ret; + union fwnet_hwaddr *ha; mutex_lock(&fwnet_device_mutex); @@ -1582,8 +1476,15 @@ static int fwnet_probe(struct device *_dev) net->mtu = min(1500U, max_mtu); /* Set our hardware address while we're at it */ - put_unaligned_be64(card->guid, net->dev_addr); - put_unaligned_be64(~0ULL, net->broadcast); + ha = (union fwnet_hwaddr *)net->dev_addr; + put_unaligned_be64(card->guid, &ha->uc.uniq_id); + ha->uc.max_rec = dev->card->max_receive; + ha->uc.sspd = dev->card->link_speed; + put_unaligned_be16(dev->local_fifo >> 32, &ha->uc.fifo_hi); + put_unaligned_be32(dev->local_fifo & 0xffffffff, &ha->uc.fifo_lo); + + memset(net->broadcast, -1, net->addr_len); + ret = register_netdev(net); if (ret) goto out; @@ -1632,8 +1533,6 @@ static int fwnet_remove(struct device *_dev) mutex_lock(&fwnet_device_mutex); net = dev->netdev; - if (net && peer->ip) - arp_invalidate(net, peer->ip); fwnet_remove_peer(peer, dev); diff --git a/include/linux/if_arp.h b/include/linux/if_arp.h index 89b4614a4722..f563907ed776 100644 --- a/include/linux/if_arp.h +++ b/include/linux/if_arp.h @@ -33,7 +33,15 @@ static inline struct arphdr *arp_hdr(const struct sk_buff *skb) static inline int arp_hdr_len(struct net_device *dev) { - /* ARP header, plus 2 device addresses, plus 2 IP addresses. 
*/ - return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2; + switch (dev->type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + /* ARP header, device address and 2 IP addresses */ + return sizeof(struct arphdr) + dev->addr_len + sizeof(u32) * 2; +#endif + default: + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ + return sizeof(struct arphdr) + (dev->addr_len + sizeof(u32)) * 2; + } } #endif /* _LINUX_IF_ARP_H */ diff --git a/include/net/firewire.h b/include/net/firewire.h new file mode 100644 index 000000000000..31bcbfe7a220 --- /dev/null +++ b/include/net/firewire.h @@ -0,0 +1,25 @@ +#ifndef _NET_FIREWIRE_H +#define _NET_FIREWIRE_H + +/* Pseudo L2 address */ +#define FWNET_ALEN 16 +union fwnet_hwaddr { + u8 u[FWNET_ALEN]; + /* "Hardware address" defined in RFC2734/RF3146 */ + struct { + __be64 uniq_id; /* EUI-64 */ + u8 max_rec; /* max packet size */ + u8 sspd; /* max speed */ + __be16 fifo_hi; /* hi 16bits of FIFO addr */ + __be32 fifo_lo; /* lo 32bits of FIFO addr */ + } __packed uc; +}; + +/* Pseudo L2 Header */ +#define FWNET_HLEN 18 +struct fwnet_header { + u8 h_dest[FWNET_ALEN]; /* destination address */ + __be16 h_proto; /* packet type ID field */ +} __packed; + +#endif diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index fea4929f6200..247ec1951c35 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -654,11 +654,19 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp_ptr += dev->addr_len; memcpy(arp_ptr, &src_ip, 4); arp_ptr += 4; - if (target_hw != NULL) - memcpy(arp_ptr, target_hw, dev->addr_len); - else - memset(arp_ptr, 0, dev->addr_len); - arp_ptr += dev->addr_len; + + switch (dev->type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + break; +#endif + default: + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr += dev->addr_len; + } memcpy(arp_ptr, &dest_ip, 4); return skb; @@ -781,7 +789,14 @@ static int arp_process(struct sk_buff *skb) arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); arp_ptr += 4; - arp_ptr += dev->addr_len; + switch (dev_type) { +#if IS_ENABLED(CONFIG_FIREWIRE_NET) + case ARPHRD_IEEE1394: + break; +#endif + default: + arp_ptr += dev->addr_len; + } memcpy(&tip, arp_ptr, 4); /* * Check for bad requests for 127.x.x.x and requests for multicast -- cgit From 0e5e4f0e56aca0df1d5648db0be9028bd573b25c Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 9 Nov 2012 16:33:05 -0700 Subject: NVMe: Add discard support for capable devices This adds discard support to block queues if the nvme device is capable of deallocating blocks as indicated by the controller's optional command support. A discard flagged bio request will submit an NVMe deallocate Data Set Management command for the requested blocks. 
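As a rough illustration of the conversion this patch performs, the sketch below simply mirrors the arithmetic of nvme_submit_discard() in the diff; the standalone helper name is hypothetical and only shows how a discard bio maps onto a single DSM deallocate range (nlb counts namespace logical blocks, slba converts 512-byte sectors to namespace blocks):

/* minimal sketch, assuming the kernel types from this patch; not part of the commit */
static void fill_dsm_range(struct nvme_dsm_range *range,
			   struct bio *bio, struct nvme_ns *ns)
{
	/* no context attributes requested */
	range->cattr = cpu_to_le32(0);
	/* discard length, in logical blocks of the namespace */
	range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift);
	/* starting LBA: 512-byte sectors shifted to the namespace block size */
	range->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9));
}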
Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox --- drivers/block/nvme.c | 60 +++++++++++++++++++++++++++++++++++++++++++++++++++- include/linux/nvme.h | 32 ++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 9dcefe40380b..26e266072079 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -80,6 +80,7 @@ struct nvme_dev { char model[40]; char firmware_rev[8]; u32 max_hw_sectors; + u16 oncs; }; /* @@ -510,6 +511,44 @@ static int nvme_map_bio(struct device *dev, struct nvme_iod *iod, return length; } +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct bio *bio, struct nvme_iod *iod, int cmdid) +{ + struct nvme_dsm_range *range; + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return -ENOMEM; + + iod_list(iod)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(bio->bi_size >> ns->lba_shift); + range->slba = cpu_to_le64(bio->bi_sector >> (ns->lba_shift - 9)); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.command_id = cmdid; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); + + return 0; +} + static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { @@ -567,6 +606,12 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (unlikely(cmdid < 0)) goto free_iod; + if (bio->bi_rw & REQ_DISCARD) { + result = nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); + if (result) + goto free_cmdid; + return result; + } if ((bio->bi_rw & REQ_FLUSH) && !psegs) return nvme_submit_flush(nvmeq, ns, cmdid); @@ -1347,6 +1392,16 @@ static void nvme_put_ns_idx(int index) spin_unlock(&dev_list_lock); } +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + ns->queue->limits.max_discard_sectors = 0xffffffff; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, struct nvme_id_ns *id, struct nvme_lba_range_type *rt) { @@ -1366,7 +1421,6 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); -/* queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); */ blk_queue_make_request(ns->queue, nvme_make_request); ns->dev = dev; ns->queue->queuedata = ns; @@ -1392,6 +1446,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int nsid, sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + if (dev->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + 
return ns; out_free_queue: @@ -1520,6 +1577,7 @@ static int nvme_dev_add(struct nvme_dev *dev) ctrl = mem; nn = le32_to_cpup(&ctrl->nn); + dev->oncs = le16_to_cpup(&ctrl->oncs); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 4fa3b0b9b071..bde44c1fd213 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -107,6 +107,12 @@ struct nvme_id_ctrl { __u8 vs[1024]; }; +enum { + NVME_CTRL_ONCS_COMPARE = 1 << 0, + NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1, + NVME_CTRL_ONCS_DSM = 1 << 2, +}; + struct nvme_lbaf { __le16 ms; __u8 ds; @@ -246,6 +252,31 @@ enum { NVME_RW_DSM_COMPRESSED = 1 << 7, }; +struct nvme_dsm_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[2]; + __le64 prp1; + __le64 prp2; + __le32 nr; + __le32 attributes; + __u32 rsvd12[4]; +}; + +enum { + NVME_DSMGMT_IDR = 1 << 0, + NVME_DSMGMT_IDW = 1 << 1, + NVME_DSMGMT_AD = 1 << 2, +}; + +struct nvme_dsm_range { + __le32 cattr; + __le32 nlb; + __le64 slba; +}; + /* Admin commands */ enum nvme_admin_opcode { @@ -372,6 +403,7 @@ struct nvme_command { struct nvme_create_sq create_sq; struct nvme_delete_queue delete_queue; struct nvme_download_firmware dlfw; + struct nvme_dsm_cmd dsm; }; }; -- cgit From ece70094f6ab2107d4313fa1802b13dab0234ac5 Mon Sep 17 00:00:00 2001 From: Prashant Gaikwad Date: Wed, 20 Mar 2013 17:30:34 +0530 Subject: clk: Add composite clock type Not all clocks need to be decomposed into basic clock types, but they may still want to use the functionality provided by those basic types instead of duplicating it. For example, the Tegra SoC has ~100 clocks which can be decomposed into Mux -> Div -> Gate clock types, bringing the clock count to ~300. Also, a parent change operation cannot be performed on a gate clock, which forces a driver to use the mux clock if it wants to change the parent. Instead, aggregate the functionality of the basic clock types into one clock and just use this clock for all operations. This clock type re-uses the functionality of the basic clock types and is not limited to them; any hardware-specific implementation can be used as well. Signed-off-by: Prashant Gaikwad Signed-off-by: Mike Turquette --- drivers/clk/Makefile | 1 + drivers/clk/clk-composite.c | 201 +++++++++++++++++++++++++++++++++++++++ include/linux/clk-provider.h | 31 +++++++ 3 files changed, 233 insertions(+) create mode 100644 drivers/clk/clk-composite.c (limited to 'include/linux') diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index 1c22f9dc721d..41cb123a2d02 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -7,6 +7,7 @@ obj-$(CONFIG_COMMON_CLK) += clk-fixed-factor.o obj-$(CONFIG_COMMON_CLK) += clk-fixed-rate.o obj-$(CONFIG_COMMON_CLK) += clk-gate.o obj-$(CONFIG_COMMON_CLK) += clk-mux.o +obj-$(CONFIG_COMMON_CLK) += clk-composite.o # SoCs specific obj-$(CONFIG_ARCH_BCM2835) += clk-bcm2835.o diff --git a/drivers/clk/clk-composite.c b/drivers/clk/clk-composite.c new file mode 100644 index 000000000000..097dee4fd209 --- /dev/null +++ b/drivers/clk/clk-composite.c @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2013 NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation.
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#define to_clk_composite(_hw) container_of(_hw, struct clk_composite, hw) + +static u8 clk_composite_get_parent(struct clk_hw *hw) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *mux_ops = composite->mux_ops; + struct clk_hw *mux_hw = composite->mux_hw; + + mux_hw->clk = hw->clk; + + return mux_ops->get_parent(mux_hw); +} + +static int clk_composite_set_parent(struct clk_hw *hw, u8 index) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *mux_ops = composite->mux_ops; + struct clk_hw *mux_hw = composite->mux_hw; + + mux_hw->clk = hw->clk; + + return mux_ops->set_parent(mux_hw, index); +} + +static unsigned long clk_composite_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *div_ops = composite->div_ops; + struct clk_hw *div_hw = composite->div_hw; + + div_hw->clk = hw->clk; + + return div_ops->recalc_rate(div_hw, parent_rate); +} + +static long clk_composite_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *prate) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *div_ops = composite->div_ops; + struct clk_hw *div_hw = composite->div_hw; + + div_hw->clk = hw->clk; + + return div_ops->round_rate(div_hw, rate, prate); +} + +static int clk_composite_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *div_ops = composite->div_ops; + struct clk_hw *div_hw = composite->div_hw; + + div_hw->clk = hw->clk; + + return div_ops->set_rate(div_hw, rate, parent_rate); +} + +static int clk_composite_is_enabled(struct clk_hw *hw) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *gate_ops = composite->gate_ops; + struct clk_hw *gate_hw = composite->gate_hw; + + gate_hw->clk = hw->clk; + + return gate_ops->is_enabled(gate_hw); +} + +static int clk_composite_enable(struct clk_hw *hw) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *gate_ops = composite->gate_ops; + struct clk_hw *gate_hw = composite->gate_hw; + + gate_hw->clk = hw->clk; + + return gate_ops->enable(gate_hw); +} + +static void clk_composite_disable(struct clk_hw *hw) +{ + struct clk_composite *composite = to_clk_composite(hw); + const struct clk_ops *gate_ops = composite->gate_ops; + struct clk_hw *gate_hw = composite->gate_hw; + + gate_hw->clk = hw->clk; + + gate_ops->disable(gate_hw); +} + +struct clk *clk_register_composite(struct device *dev, const char *name, + const char **parent_names, int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *div_hw, const struct clk_ops *div_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags) +{ + struct clk *clk; + struct clk_init_data init; + struct clk_composite *composite; + struct clk_ops *clk_composite_ops; + + composite = kzalloc(sizeof(*composite), GFP_KERNEL); + if (!composite) { + pr_err("%s: could not allocate composite clk\n", 
__func__); + return ERR_PTR(-ENOMEM); + } + + init.name = name; + init.flags = flags | CLK_IS_BASIC; + init.parent_names = parent_names; + init.num_parents = num_parents; + + clk_composite_ops = &composite->ops; + + if (mux_hw && mux_ops) { + if (!mux_ops->get_parent || !mux_ops->set_parent) { + clk = ERR_PTR(-EINVAL); + goto err; + } + + composite->mux_hw = mux_hw; + composite->mux_ops = mux_ops; + clk_composite_ops->get_parent = clk_composite_get_parent; + clk_composite_ops->set_parent = clk_composite_set_parent; + } + + if (div_hw && div_ops) { + if (!div_ops->recalc_rate || !div_ops->round_rate || + !div_ops->set_rate) { + clk = ERR_PTR(-EINVAL); + goto err; + } + + composite->div_hw = div_hw; + composite->div_ops = div_ops; + clk_composite_ops->recalc_rate = clk_composite_recalc_rate; + clk_composite_ops->round_rate = clk_composite_round_rate; + clk_composite_ops->set_rate = clk_composite_set_rate; + } + + if (gate_hw && gate_ops) { + if (!gate_ops->is_enabled || !gate_ops->enable || + !gate_ops->disable) { + clk = ERR_PTR(-EINVAL); + goto err; + } + + composite->gate_hw = gate_hw; + composite->gate_ops = gate_ops; + clk_composite_ops->is_enabled = clk_composite_is_enabled; + clk_composite_ops->enable = clk_composite_enable; + clk_composite_ops->disable = clk_composite_disable; + } + + init.ops = clk_composite_ops; + composite->hw.init = &init; + + clk = clk_register(dev, &composite->hw); + if (IS_ERR(clk)) + goto err; + + if (composite->mux_hw) + composite->mux_hw->clk = clk; + + if (composite->div_hw) + composite->div_hw->clk = clk; + + if (composite->gate_hw) + composite->gate_hw->clk = clk; + + return clk; + +err: + kfree(composite); + return clk; +} diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 63ba3b740794..1f0352802794 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -342,6 +342,37 @@ struct clk *clk_register_fixed_factor(struct device *dev, const char *name, const char *parent_name, unsigned long flags, unsigned int mult, unsigned int div); +/** + * struct clk_composite - aggregate clock of mux, divider and gate clocks + * + * @hw: handle between common and hardware-specific interfaces + * @mux_hw: handle between composite and hardware-specific mux clock + * @div_hw: handle between composite and hardware-specific divider clock + * @gate_hw: handle between composite and hardware-specific gate clock + * @mux_ops: clock ops for mux + * @div_ops: clock ops for divider + * @gate_ops: clock ops for gate + */ +struct clk_composite { + struct clk_hw hw; + struct clk_ops ops; + + struct clk_hw *mux_hw; + struct clk_hw *div_hw; + struct clk_hw *gate_hw; + + const struct clk_ops *mux_ops; + const struct clk_ops *div_ops; + const struct clk_ops *gate_ops; +}; + +struct clk *clk_register_composite(struct device *dev, const char *name, + const char **parent_names, int num_parents, + struct clk_hw *mux_hw, const struct clk_ops *mux_ops, + struct clk_hw *div_hw, const struct clk_ops *div_ops, + struct clk_hw *gate_hw, const struct clk_ops *gate_ops, + unsigned long flags); + /** * clk_register - allocate a new clock, register it and return an opaque cookie * @dev: device that is registering this clock -- cgit From 3a54aaa0a3ddb2cf2ec1b94a94024e9a8a8af962 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:26 +0100 Subject: perf/x86: Improve sysfs event mapping with event string This patch extends Jiri's changes to make generic event mappings visible via sysfs.
The patch extends the mechanism to non-generic events by allowing the mappings to be hardcoded in strings. This mechanism will be used by the PEBS-LL patch later on. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-3-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar [ fixed up conflict with 2663960 "perf: Make EVENT_ATTR global" ] Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 20 ++++++++++++-------- arch/x86/kernel/cpu/perf_event.h | 17 +++++++++++++++++ include/linux/perf_event.h | 1 + 3 files changed, 30 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c886dc8c63f8..6e8ab0427041 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1316,9 +1316,16 @@ static struct attribute_group x86_pmu_format_group = { */ static void __init filter_events(struct attribute **attrs) { + struct device_attribute *d; + struct perf_pmu_events_attr *pmu_attr; int i, j; for (i = 0; attrs[i]; i++) { + d = (struct device_attribute *)attrs[i]; + pmu_attr = container_of(d, struct perf_pmu_events_attr, attr); + /* str trumps id */ + if (pmu_attr->event_str) + continue; if (x86_pmu.event_map(i)) continue; @@ -1361,17 +1368,14 @@ static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *at { struct perf_pmu_events_attr *pmu_attr = \ container_of(attr, struct perf_pmu_events_attr, attr); - u64 config = x86_pmu.event_map(pmu_attr->id); - return x86_pmu.events_sysfs_show(page, config); -} -#define EVENT_VAR(_id) event_attr_##_id -#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + /* string trumps id */ + if (pmu_attr->event_str) + return sprintf(page, "%s", pmu_attr->event_str); -#define EVENT_ATTR(_name, _id) \ - PMU_EVENT_ATTR(_name, EVENT_VAR(_id), PERF_COUNT_HW_##_id, \ - events_sysfs_show) + return x86_pmu.events_sysfs_show(page, config); +} EVENT_ATTR(cpu-cycles, CPU_CYCLES ); EVENT_ATTR(instructions, INSTRUCTIONS ); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 95152c12a8d9..b1518eed5f99 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -422,6 +422,23 @@ do { \ #define ERF_NO_HT_SHARING 1 #define ERF_HAS_RSP_1 2 +#define EVENT_VAR(_id) event_attr_##_id +#define EVENT_PTR(_id) &event_attr_##_id.attr.attr + +#define EVENT_ATTR(_name, _id) \ +static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = PERF_COUNT_HW_##_id, \ + .event_str = NULL, \ +}; + +#define EVENT_ATTR_STR(_name, v, str) \ +static struct perf_pmu_events_attr event_attr_##v = { \ + .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ + .id = 0, \ + .event_str = str, \ +}; + extern struct x86_pmu x86_pmu __read_mostly; DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 8737e1cee8b2..1c592114c437 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -809,6 +809,7 @@ do { \ struct perf_pmu_events_attr { struct device_attribute attr; u64 id; + const char *event_str; }; #define PMU_EVENT_ATTR(_name, _var, _id, _show) \ -- cgit From 85467136cdcc674f30beb0e5b79f048fe3a6a76f Mon Sep 17 00:00:00 2001 From: Shuah Khan Date: Wed, 27 Feb 2013 17:06:45 -0700 Subject: PCI: Add 
PCI_BUS_NUM() and PCI_DEVID() interfaces PCI defines PCI_DEVFN(), PCI_SLOT(), and PCI_FUNC() interfaces; however, it doesn't have interfaces to return the PCI bus number and PCI device id. Drivers (AMD IOMMU and AER) implement module-specific definitions for PCI_BUS(), and the AMD IOMMU driver also has a module-specific interface to calculate the PCI device id from bus number and devfn. Add PCI_BUS_NUM and PCI_DEVID interfaces to return the PCI bus number and PCI device id, respectively, to avoid the need for duplicate definitions in other modules. The AER driver code and the AMD IOMMU driver define PCI_BUS. The AMD IOMMU driver defines an interface to calculate the device id from a bus number and devfn pair. PCI_DEVFN(), PCI_SLOT(), and PCI_FUNC() interfaces are exported to user-space via uapi/linux/pci.h. However, in the interest of keeping the new interfaces kernel-only and not exporting them to user-space unnecessarily, they are added to linux/pci.h instead. Signed-off-by: Shuah Khan Signed-off-by: Bjorn Helgaas Acked-by: Joerg Roedel --- include/linux/pci.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 2461033a7987..849a336e149c 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -35,6 +35,21 @@ /* Include the ID list */ #include +/* + * The PCI interface treats multi-function devices as independent + * devices. The slot/function address of each device is encoded + * in a single byte as follows: + * + * 7:3 = slot + * 2:0 = function + * PCI_DEVFN(), PCI_SLOT(), and PCI_FUNC() are defined in uapi/linux/pci.h + * In the interest of not exposing interfaces to user-space unnecessarily, + * the following kernel-only defines are being added here. + */ +#define PCI_DEVID(bus, devfn) ((((u16)bus) << 8) | devfn) +/* return bus from PCI devid = ((u16)bus_number) << 8) | devfn */ +#define PCI_BUS_NUM(x) (((x) >> 8) & 0xff) + /* pci_slot represents a physical slot */ struct pci_slot { struct pci_bus *bus; /* The bus this slot is on */ -- cgit From 13c3b0fcc8e33ba49f252378f6e7290b146042af Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 4 Mar 2013 18:40:57 -0700 Subject: NVMe: Move structures & definitions to header file nvme-scsi.c uses several data structures and definitions that were previously private to nvme-core.c. Move the definitions to nvme.h, protected by __KERNEL__. Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 55 ------------------------------------------- include/linux/nvme.h | 60 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 26e266072079..1f98040cf677 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -46,7 +46,6 @@ #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define NVME_MINORS 64 -#define NVME_IO_TIMEOUT (5 * HZ) #define ADMIN_TIMEOUT (60 * HZ) static int nvme_major; @@ -59,44 +58,6 @@ static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; -/* - * Represents an NVM Express device. Each nvme_dev is a PCI function.
- */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; - u32 __iomem *dbs; - struct pci_dev *pci_dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; - int instance; - int queue_count; - int db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; - struct list_head namespaces; - char serial[20]; - char model[40]; - char firmware_rev[8]; - u32 max_hw_sectors; - u16 oncs; -}; - -/* - * An NVM Express namespace is equivalent to a SCSI LUN - */ -struct nvme_ns { - struct list_head list; - - struct nvme_dev *dev; - struct request_queue *queue; - struct gendisk *disk; - - int ns_id; - int lba_shift; -}; - /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -295,22 +256,6 @@ static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) return 0; } -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - void *private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist sg[0]; -}; - static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bde44c1fd213..6f899add14ab 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -493,4 +493,64 @@ struct nvme_admin_cmd { #define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#ifdef __KERNEL__ +#include + +#define NVME_IO_TIMEOUT (5 * HZ) + +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. + */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + u32 __iomem *dbs; + struct pci_dev *pci_dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + int instance; + int queue_count; + int db_stride; + u32 ctrl_config; + struct msix_entry *entry; + struct nvme_bar __iomem *bar; + struct list_head namespaces; + char serial[20]; + char model[40]; + char firmware_rev[8]; + u32 max_hw_sectors; + u16 oncs; +}; + +/* + * An NVM Express namespace is equivalent to a SCSI LUN + */ +struct nvme_ns { + struct list_head list; + + struct nvme_dev *dev; + struct request_queue *queue; + struct gendisk *disk; + + int ns_id; + int lba_shift; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_alloc_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + void *private; /* For the use of the submitter of the I/O */ + int npages; /* In the PRP list. 
0 means small pool in use */ + int offset; /* Of PRP list */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist sg[0]; +}; +#endif + #endif /* _LINUX_NVME_H */ -- cgit From f8ebf8409abfdaeeb8c847381629a2a8b8e3d816 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Wed, 27 Mar 2013 07:13:41 -0400 Subject: NVMe: Add definitions for format command The SCSI emulation has the ability to send format commands, so we need to add the definition of the command. Also add a missing error code. Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/nvme-core.c | 1 + include/linux/nvme.h | 12 ++++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/linux') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 1f98040cf677..d0cfb85d5582 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -93,6 +93,7 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(sizeof(struct nvme_features) != 64); + BUILD_BUG_ON(sizeof(struct nvme_format_cmd) != 64); BUILD_BUG_ON(sizeof(struct nvme_command) != 64); BUILD_BUG_ON(sizeof(struct nvme_id_ctrl) != 4096); BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096); diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 6f899add14ab..f1974cab60cf 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -393,6 +393,16 @@ struct nvme_download_firmware { __u32 rsvd12[4]; }; +struct nvme_format_cmd { + __u8 opcode; + __u8 flags; + __u16 command_id; + __le32 nsid; + __u64 rsvd2[4]; + __le32 cdw10; + __u32 rsvd11[5]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -403,6 +413,7 @@ struct nvme_command { struct nvme_create_sq create_sq; struct nvme_delete_queue delete_queue; struct nvme_download_firmware dlfw; + struct nvme_format_cmd format; struct nvme_dsm_cmd dsm; }; }; @@ -420,6 +431,7 @@ enum { NVME_SC_FUSED_FAIL = 0x9, NVME_SC_FUSED_MISSING = 0xa, NVME_SC_INVALID_NS = 0xb, + NVME_SC_CMD_SEQ_ERROR = 0xc, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, -- cgit From e05a4f4fc9ddf7a8633c368786a115b3111d36fd Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 25 Mar 2013 21:12:27 +0100 Subject: Remove spurious _H suffixes from ifdef comments Signed-off-by: Paul Bolle Signed-off-by: Jiri Kosina --- include/linux/evm.h | 2 +- include/linux/ima.h | 4 ++-- include/linux/integrity.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/evm.h b/include/linux/evm.h index 9fc13a760928..1fcb88ca88de 100644 --- a/include/linux/evm.h +++ b/include/linux/evm.h @@ -96,5 +96,5 @@ static inline int evm_inode_init_security(struct inode *inode, return 0; } -#endif /* CONFIG_EVM_H */ +#endif /* CONFIG_EVM */ #endif /* LINUX_EVM_H */ diff --git a/include/linux/ima.h b/include/linux/ima.h index 86c361e947b9..1b7f268cddce 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -46,7 +46,7 @@ static inline int ima_module_check(struct file *file) return 0; } -#endif /* CONFIG_IMA_H */ +#endif /* CONFIG_IMA */ #ifdef CONFIG_IMA_APPRAISE extern void ima_inode_post_setattr(struct dentry *dentry); @@ -72,5 +72,5 @@ static inline int ima_inode_removexattr(struct dentry *dentry, { return 0; } -#endif /* CONFIG_IMA_APPRAISE_H */ +#endif /* CONFIG_IMA_APPRAISE */ #endif /* _LINUX_IMA_H */ diff --git a/include/linux/integrity.h 
b/include/linux/integrity.h index 66c5fe9550a5..83222cebd47b 100644 --- a/include/linux/integrity.h +++ b/include/linux/integrity.h @@ -36,5 +36,5 @@ static inline void integrity_inode_free(struct inode *inode) { return; } -#endif /* CONFIG_INTEGRITY_H */ +#endif /* CONFIG_INTEGRITY */ #endif /* _LINUX_INTEGRITY_H */ -- cgit From 4f22decf9b6329acfe59091c5cba6b378b9b31db Mon Sep 17 00:00:00 2001 From: Benjamin Tissoires Date: Fri, 22 Mar 2013 18:38:28 +0100 Subject: HID: input: don't register unmapped input devices There is no need to register an input device containing no events. This allows drivers using the quirk MULTI_INPUT to register one input device per report actually used. For backward compatibility, we need to add a quirk to request this behavior. Signed-off-by: Benjamin Tissoires Signed-off-by: Jiri Kosina --- drivers/hid/hid-input.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/hid.h | 1 + 2 files changed, 78 insertions(+) (limited to 'include/linux') diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c index 21b196c394b1..945b8158ec4c 100644 --- a/drivers/hid/hid-input.c +++ b/drivers/hid/hid-input.c @@ -1198,6 +1198,67 @@ static struct hid_input *hidinput_allocate(struct hid_device *hid) return hidinput; } +static bool hidinput_has_been_populated(struct hid_input *hidinput) +{ + int i; + unsigned long r = 0; + + for (i = 0; i < BITS_TO_LONGS(EV_CNT); i++) + r |= hidinput->input->evbit[i]; + + for (i = 0; i < BITS_TO_LONGS(KEY_CNT); i++) + r |= hidinput->input->keybit[i]; + + for (i = 0; i < BITS_TO_LONGS(REL_CNT); i++) + r |= hidinput->input->relbit[i]; + + for (i = 0; i < BITS_TO_LONGS(ABS_CNT); i++) + r |= hidinput->input->absbit[i]; + + for (i = 0; i < BITS_TO_LONGS(MSC_CNT); i++) + r |= hidinput->input->mscbit[i]; + + for (i = 0; i < BITS_TO_LONGS(LED_CNT); i++) + r |= hidinput->input->ledbit[i]; + + for (i = 0; i < BITS_TO_LONGS(SND_CNT); i++) + r |= hidinput->input->sndbit[i]; + + for (i = 0; i < BITS_TO_LONGS(FF_CNT); i++) + r |= hidinput->input->ffbit[i]; + + for (i = 0; i < BITS_TO_LONGS(SW_CNT); i++) + r |= hidinput->input->swbit[i]; + + return !!r; +} + +static void hidinput_cleanup_hidinput(struct hid_device *hid, + struct hid_input *hidinput) +{ + struct hid_report *report; + int i, k; + + list_del(&hidinput->list); + input_free_device(hidinput->input); + + for (k = HID_INPUT_REPORT; k <= HID_OUTPUT_REPORT; k++) { + if (k == HID_OUTPUT_REPORT && + hid->quirks & HID_QUIRK_SKIP_OUTPUT_REPORTS) + continue; + + list_for_each_entry(report, &hid->report_enum[k].report_list, + list) { + + for (i = 0; i < report->maxfield; i++) + if (report->field[i]->hidinput == hidinput) + report->field[i]->hidinput = NULL; + } + } + + kfree(hidinput); +} + /* * Register the input device; print a message.
* Configure the input layer interface @@ -1249,6 +1310,10 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) hidinput_configure_usage(hidinput, report->field[i], report->field[i]->usage + j); + if ((hid->quirks & HID_QUIRK_NO_EMPTY_INPUT) && + !hidinput_has_been_populated(hidinput)) + continue; + if (hid->quirks & HID_QUIRK_MULTI_INPUT) { /* This will leave hidinput NULL, so that it * allocates another one if we have more inputs on @@ -1265,6 +1330,18 @@ int hidinput_connect(struct hid_device *hid, unsigned int force) } } + if (hidinput && (hid->quirks & HID_QUIRK_NO_EMPTY_INPUT) && + !hidinput_has_been_populated(hidinput)) { + /* no need to register an input device not populated */ + hidinput_cleanup_hidinput(hid, hidinput); + hidinput = NULL; + } + + if (list_empty(&hid->inputs)) { + hid_err(hid, "No inputs registered, leaving\n"); + goto out_unwind; + } + if (hidinput) { if (drv->input_configured) drv->input_configured(hid, hidinput); diff --git a/include/linux/hid.h b/include/linux/hid.h index 863744c38ddc..fffa06bc4880 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -282,6 +282,7 @@ struct hid_item { #define HID_QUIRK_BADPAD 0x00000020 #define HID_QUIRK_MULTI_INPUT 0x00000040 #define HID_QUIRK_HIDINPUT_FORCE 0x00000080 +#define HID_QUIRK_NO_EMPTY_INPUT 0x00000100 #define HID_QUIRK_SKIP_OUTPUT_REPORTS 0x00010000 #define HID_QUIRK_FULLSPEED_INTERVAL 0x10000000 #define HID_QUIRK_NO_INIT_REPORTS 0x20000000 -- cgit From 221ad7f2df7c54b3f05471a3599ea7368366aaeb Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 26 Mar 2013 21:24:20 +0000 Subject: regmap: core: Provide regmap_can_raw_write() operation Mainly useful internally but exported since this is a public API that's being checked for. Signed-off-by: Mark Brown --- drivers/base/regmap/regmap.c | 15 ++++++++++++--- include/linux/regmap.h | 1 + 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 9174c9d45a16..9ab1e1fedbc9 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -1097,6 +1097,17 @@ int _regmap_raw_write(struct regmap *map, unsigned int reg, return ret; } +/** + * regmap_can_raw_write - Test if regmap_raw_write() is supported + * + * @map: Map to check. 
+ */ +bool regmap_can_raw_write(struct regmap *map) +{ + return map->bus && map->format.format_val && map->format.format_reg; +} +EXPORT_SYMBOL_GPL(regmap_can_raw_write); + static int _regmap_bus_formatted_write(void *context, unsigned int reg, unsigned int val) { @@ -1220,12 +1231,10 @@ int regmap_raw_write(struct regmap *map, unsigned int reg, { int ret; - if (!map->bus) + if (!regmap_can_raw_write(map)) return -EINVAL; if (val_len % map->format.val_bytes) return -EINVAL; - if (reg % map->reg_stride) - return -EINVAL; map->lock(map->lock_arg); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index bf77dfdabef9..02d84e24b7c2 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -389,6 +389,7 @@ int regmap_update_bits_check(struct regmap *map, unsigned int reg, bool *change); int regmap_get_val_bytes(struct regmap *map); int regmap_async_complete(struct regmap *map); +bool regmap_can_raw_write(struct regmap *map); int regcache_sync(struct regmap *map); int regcache_sync_region(struct regmap *map, unsigned int min, -- cgit From 8761e31c227f9751327196f170eba2b519eab48f Mon Sep 17 00:00:00 2001 From: Cody P Schafer Date: Tue, 26 Mar 2013 10:30:44 -0700 Subject: mmzone: correct "pags" to "pages" in comment. Signed-off-by: Cody P Schafer Signed-off-by: Jiri Kosina --- include/linux/mmzone.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ede274957e05..2570216b844a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -450,7 +450,7 @@ struct zone { * * present_pages is physical pages existing within the zone, which * is calculated as: - * present_pages = spanned_pages - absent_pages(pags in holes); + * present_pages = spanned_pages - absent_pages(pages in holes); * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the -- cgit From e874a6697710f52fa8ab29487a99034d5d96fdcc Mon Sep 17 00:00:00 2001 From: Emilio López Date: Mon, 25 Feb 2013 11:44:26 -0300 Subject: clk: arm: sunxi: Add a new clock driver for sunxi SOCs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit implements the base CPU clocks for sunxi devices. It has been tested using a slightly modified cpufreq driver from the linux-sunxi 3.0 tree. Additionally, document the new bindings introduced by this patch. 
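Before the measurements, a quick sanity check of the factor formula implemented by clk-factors.c in this series, rate = (parent_rate * n * (k + 1) >> p) / (m + 1); the sketch below is a minimal user-space program, and the factor values are illustrative assumptions rather than values read from hardware:

#include <stdio.h>

/* same arithmetic as clk_factors_recalc_rate() later in this patch */
static unsigned long factors_rate(unsigned long parent_rate, unsigned int n,
				  unsigned int k, unsigned int m, unsigned int p)
{
	return (parent_rate * n * (k + 1) >> p) / (m + 1);
}

int main(void)
{
	/* e.g. n = 42, k = m = p = 0 takes the 24 MHz oscillator to the
	 * 1008 MHz PLL1 rate seen in the clk_summary output below */
	printf("%lu\n", factors_rate(24000000UL, 42, 0, 0, 0));
	return 0;
}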
Idling: / # cat /sys/kernel/debug/clk/clk_summary clock enable_cnt prepare_cnt rate --------------------------------------------------------------------- osc32k 0 0 32768 osc24M_fixed 0 0 24000000 osc24M 0 0 24000000 apb1_mux 0 0 24000000 apb1 0 0 24000000 pll1 0 0 60000000 cpu 0 0 60000000 axi 0 0 60000000 ahb 0 0 60000000 apb0 0 0 30000000 dummy 0 0 0 After "yes >/dev/null &": / # cat /sys/kernel/debug/clk/clk_summary clock enable_cnt prepare_cnt rate --------------------------------------------------------------------- osc32k 0 0 32768 osc24M_fixed 0 0 24000000 osc24M 0 0 24000000 apb1_mux 0 0 24000000 apb1 0 0 24000000 pll1 0 0 1008000000 cpu 0 0 1008000000 axi 0 0 336000000 ahb 0 0 168000000 apb0 0 0 84000000 dummy 0 0 0 Signed-off-by: Emilio López Acked-by: Maxime Ripard Signed-off-by: Mike Turquette --- Documentation/devicetree/bindings/clock/sunxi.txt | 44 +++ drivers/clk/Makefile | 1 + drivers/clk/sunxi/Makefile | 5 + drivers/clk/sunxi/clk-factors.c | 180 +++++++++++ drivers/clk/sunxi/clk-factors.h | 27 ++ drivers/clk/sunxi/clk-sunxi.c | 362 ++++++++++++++++++++++ drivers/clocksource/sunxi_timer.c | 4 +- include/linux/clk/sunxi.h | 22 ++ 8 files changed, 643 insertions(+), 2 deletions(-) create mode 100644 Documentation/devicetree/bindings/clock/sunxi.txt create mode 100644 drivers/clk/sunxi/Makefile create mode 100644 drivers/clk/sunxi/clk-factors.c create mode 100644 drivers/clk/sunxi/clk-factors.h create mode 100644 drivers/clk/sunxi/clk-sunxi.c create mode 100644 include/linux/clk/sunxi.h (limited to 'include/linux') diff --git a/Documentation/devicetree/bindings/clock/sunxi.txt b/Documentation/devicetree/bindings/clock/sunxi.txt new file mode 100644 index 000000000000..b23cfbdbcd6d --- /dev/null +++ b/Documentation/devicetree/bindings/clock/sunxi.txt @@ -0,0 +1,44 @@ +Device Tree Clock bindings for arch-sunxi + +This binding uses the common clock binding[1]. + +[1] Documentation/devicetree/bindings/clock/clock-bindings.txt + +Required properties: +- compatible : shall be one of the following: + "allwinner,sunxi-osc-clk" - for a gatable oscillator + "allwinner,sunxi-pll1-clk" - for the main PLL clock + "allwinner,sunxi-cpu-clk" - for the CPU multiplexer clock + "allwinner,sunxi-axi-clk" - for the sunxi AXI clock + "allwinner,sunxi-ahb-clk" - for the sunxi AHB clock + "allwinner,sunxi-apb0-clk" - for the sunxi APB0 clock + "allwinner,sunxi-apb1-clk" - for the sunxi APB1 clock + "allwinner,sunxi-apb1-mux-clk" - for the sunxi APB1 clock muxing + +Required properties for all clocks: +- reg : shall be the control register address for the clock. +- clocks : shall be the input parent clock(s) phandle for the clock +- #clock-cells : from common clock binding; shall be set to 0. 
+ +For example: + +osc24M: osc24M@01c20050 { + #clock-cells = <0>; + compatible = "allwinner,sunxi-osc-clk"; + reg = <0x01c20050 0x4>; + clocks = <&osc24M_fixed>; +}; + +pll1: pll1@01c20000 { + #clock-cells = <0>; + compatible = "allwinner,sunxi-pll1-clk"; + reg = <0x01c20000 0x4>; + clocks = <&osc24M>; +}; + +cpu: cpu@01c20054 { + #clock-cells = <0>; + compatible = "allwinner,sunxi-cpu-clk"; + reg = <0x01c20054 0x4>; + clocks = <&osc32k>, <&osc24M>, <&pll1>; +}; diff --git a/drivers/clk/Makefile b/drivers/clk/Makefile index 41cb123a2d02..79e98e416724 100644 --- a/drivers/clk/Makefile +++ b/drivers/clk/Makefile @@ -24,6 +24,7 @@ ifeq ($(CONFIG_COMMON_CLK), y) obj-$(CONFIG_ARCH_MMP) += mmp/ endif obj-$(CONFIG_MACH_LOONGSON1) += clk-ls1x.o +obj-$(CONFIG_ARCH_SUNXI) += sunxi/ obj-$(CONFIG_ARCH_U8500) += ux500/ obj-$(CONFIG_ARCH_VT8500) += clk-vt8500.o obj-$(CONFIG_ARCH_ZYNQ) += clk-zynq.o diff --git a/drivers/clk/sunxi/Makefile b/drivers/clk/sunxi/Makefile new file mode 100644 index 000000000000..b5bac917612c --- /dev/null +++ b/drivers/clk/sunxi/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for sunxi specific clk +# + +obj-y += clk-sunxi.o clk-factors.o diff --git a/drivers/clk/sunxi/clk-factors.c b/drivers/clk/sunxi/clk-factors.c new file mode 100644 index 000000000000..88523f91d9b7 --- /dev/null +++ b/drivers/clk/sunxi/clk-factors.c @@ -0,0 +1,180 @@ +/* + * Copyright (C) 2013 Emilio López + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Adjustable factor-based clock implementation + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "clk-factors.h" + +/* + * DOC: basic adjustable factor-based clock that cannot gate + * + * Traits of this clock: + * prepare - clk_prepare only ensures that parents are prepared + * enable - clk_enable only ensures that parents are enabled + * rate - rate is adjustable. + * clk->rate = (parent->rate * N * (K + 1) >> P) / (M + 1) + * parent - fixed parent. 
No clk_set_parent support + */ + +struct clk_factors { + struct clk_hw hw; + void __iomem *reg; + struct clk_factors_config *config; + void (*get_factors) (u32 *rate, u32 parent, u8 *n, u8 *k, u8 *m, u8 *p); + spinlock_t *lock; +}; + +#define to_clk_factors(_hw) container_of(_hw, struct clk_factors, hw) + +#define SETMASK(len, pos) (((-1U) >> (31-len)) << (pos)) +#define CLRMASK(len, pos) (~(SETMASK(len, pos))) +#define FACTOR_GET(bit, len, reg) (((reg) & SETMASK(len, bit)) >> (bit)) + +#define FACTOR_SET(bit, len, reg, val) \ + (((reg) & CLRMASK(len, bit)) | (val << (bit))) + +static unsigned long clk_factors_recalc_rate(struct clk_hw *hw, + unsigned long parent_rate) +{ + u8 n = 1, k = 0, p = 0, m = 0; + u32 reg; + unsigned long rate; + struct clk_factors *factors = to_clk_factors(hw); + struct clk_factors_config *config = factors->config; + + /* Fetch the register value */ + reg = readl(factors->reg); + + /* Get each individual factor if applicable */ + if (config->nwidth != SUNXI_FACTORS_NOT_APPLICABLE) + n = FACTOR_GET(config->nshift, config->nwidth, reg); + if (config->kwidth != SUNXI_FACTORS_NOT_APPLICABLE) + k = FACTOR_GET(config->kshift, config->kwidth, reg); + if (config->mwidth != SUNXI_FACTORS_NOT_APPLICABLE) + m = FACTOR_GET(config->mshift, config->mwidth, reg); + if (config->pwidth != SUNXI_FACTORS_NOT_APPLICABLE) + p = FACTOR_GET(config->pshift, config->pwidth, reg); + + /* Calculate the rate */ + rate = (parent_rate * n * (k + 1) >> p) / (m + 1); + + return rate; +} + +static long clk_factors_round_rate(struct clk_hw *hw, unsigned long rate, + unsigned long *parent_rate) +{ + struct clk_factors *factors = to_clk_factors(hw); + factors->get_factors((u32 *)&rate, (u32)*parent_rate, + NULL, NULL, NULL, NULL); + + return rate; +} + +static int clk_factors_set_rate(struct clk_hw *hw, unsigned long rate, + unsigned long parent_rate) +{ + u8 n, k, m, p; + u32 reg; + struct clk_factors *factors = to_clk_factors(hw); + struct clk_factors_config *config = factors->config; + unsigned long flags = 0; + + factors->get_factors((u32 *)&rate, (u32)parent_rate, &n, &k, &m, &p); + + if (factors->lock) + spin_lock_irqsave(factors->lock, flags); + + /* Fetch the register value */ + reg = readl(factors->reg); + + /* Set up the new factors - macros do not do anything if width is 0 */ + reg = FACTOR_SET(config->nshift, config->nwidth, reg, n); + reg = FACTOR_SET(config->kshift, config->kwidth, reg, k); + reg = FACTOR_SET(config->mshift, config->mwidth, reg, m); + reg = FACTOR_SET(config->pshift, config->pwidth, reg, p); + + /* Apply them now */ + writel(reg, factors->reg); + + /* delay 500us so pll stabilizes */ + __delay((rate >> 20) * 500 / 2); + + if (factors->lock) + spin_unlock_irqrestore(factors->lock, flags); + + return 0; +} + +static const struct clk_ops clk_factors_ops = { + .recalc_rate = clk_factors_recalc_rate, + .round_rate = clk_factors_round_rate, + .set_rate = clk_factors_set_rate, +}; + +/** + * clk_register_factors - register a factors clock with + * the clock framework + * @dev: device registering this clock + * @name: name of this clock + * @parent_name: name of clock's parent + * @flags: framework-specific flags + * @reg: register address to adjust factors + * @config: shift and width of factors n, k, m and p + * @get_factors: function to calculate the factors for a given frequency + * @lock: shared register lock for this clock + */ +struct clk *clk_register_factors(struct device *dev, const char *name, + const char *parent_name, + unsigned long flags, void __iomem *reg, + 
struct clk_factors_config *config, + void (*get_factors)(u32 *rate, u32 parent, + u8 *n, u8 *k, u8 *m, u8 *p), + spinlock_t *lock) +{ + struct clk_factors *factors; + struct clk *clk; + struct clk_init_data init; + + /* allocate the factors */ + factors = kzalloc(sizeof(struct clk_factors), GFP_KERNEL); + if (!factors) { + pr_err("%s: could not allocate factors clk\n", __func__); + return ERR_PTR(-ENOMEM); + } + + init.name = name; + init.ops = &clk_factors_ops; + init.flags = flags; + init.parent_names = (parent_name ? &parent_name : NULL); + init.num_parents = (parent_name ? 1 : 0); + + /* struct clk_factors assignments */ + factors->reg = reg; + factors->config = config; + factors->lock = lock; + factors->hw.init = &init; + factors->get_factors = get_factors; + + /* register the clock */ + clk = clk_register(dev, &factors->hw); + + if (IS_ERR(clk)) + kfree(factors); + + return clk; +} diff --git a/drivers/clk/sunxi/clk-factors.h b/drivers/clk/sunxi/clk-factors.h new file mode 100644 index 000000000000..f49851cc4380 --- /dev/null +++ b/drivers/clk/sunxi/clk-factors.h @@ -0,0 +1,27 @@ +#ifndef __MACH_SUNXI_CLK_FACTORS_H +#define __MACH_SUNXI_CLK_FACTORS_H + +#include +#include + +#define SUNXI_FACTORS_NOT_APPLICABLE (0) + +struct clk_factors_config { + u8 nshift; + u8 nwidth; + u8 kshift; + u8 kwidth; + u8 mshift; + u8 mwidth; + u8 pshift; + u8 pwidth; +}; + +struct clk *clk_register_factors(struct device *dev, const char *name, + const char *parent_name, + unsigned long flags, void __iomem *reg, + struct clk_factors_config *config, + void (*get_factors) (u32 *rate, u32 parent_rate, + u8 *n, u8 *k, u8 *m, u8 *p), + spinlock_t *lock); +#endif diff --git a/drivers/clk/sunxi/clk-sunxi.c b/drivers/clk/sunxi/clk-sunxi.c new file mode 100644 index 000000000000..d4ad1c22859e --- /dev/null +++ b/drivers/clk/sunxi/clk-sunxi.c @@ -0,0 +1,362 @@ +/* + * Copyright 2013 Emilio López + * + * Emilio López + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include + +#include "clk-factors.h" + +static DEFINE_SPINLOCK(clk_lock); + +/** + * sunxi_osc_clk_setup() - Setup function for gatable oscillator + */ + +#define SUNXI_OSC24M_GATE 0 + +static void __init sunxi_osc_clk_setup(struct device_node *node) +{ + struct clk *clk; + const char *clk_name = node->name; + const char *parent; + void *reg; + + reg = of_iomap(node, 0); + + parent = of_clk_get_parent_name(node, 0); + + clk = clk_register_gate(NULL, clk_name, parent, CLK_IGNORE_UNUSED, + reg, SUNXI_OSC24M_GATE, 0, &clk_lock); + + if (clk) { + of_clk_add_provider(node, of_clk_src_simple_get, clk); + clk_register_clkdev(clk, clk_name, NULL); + } +} + + + +/** + * sunxi_get_pll1_factors() - calculates n, k, m, p factors for PLL1 + * PLL1 rate is calculated as follows + * rate = (parent_rate * n * (k + 1) >> p) / (m + 1); + * parent_rate is always 24Mhz + */ + +static void sunxi_get_pll1_factors(u32 *freq, u32 parent_rate, + u8 *n, u8 *k, u8 *m, u8 *p) +{ + u8 div; + + /* Normalize value to a 6M multiple */ + div = *freq / 6000000; + *freq = 6000000 * div; + + /* we were called to round the frequency, we can now return */ + if (n == NULL) + return; + + /* m is always zero for pll1 */ + *m = 0; + + /* k is 1 only on these cases */ + if (*freq >= 768000000 || *freq == 42000000 || *freq == 54000000) + *k = 1; + else + *k = 0; + + /* p will be 3 for divs under 10 */ + if (div < 10) + *p = 3; + + /* p will be 2 for divs between 10 - 20 and odd divs under 32 */ + else if (div < 20 || (div < 32 && (div & 1))) + *p = 2; + + /* p will be 1 for even divs under 32, divs under 40 and odd pairs + * of divs between 40-62 */ + else if (div < 40 || (div < 64 && (div & 2))) + *p = 1; + + /* any other entries have p = 0 */ + else + *p = 0; + + /* calculate a suitable n based on k and p */ + div <<= *p; + div /= (*k + 1); + *n = div / 4; +} + + + +/** + * sunxi_get_apb1_factors() - calculates m, p factors for APB1 + * APB1 rate is calculated as follows + * rate = (parent_rate >> p) / (m + 1); + */ + +static void sunxi_get_apb1_factors(u32 *freq, u32 parent_rate, + u8 *n, u8 *k, u8 *m, u8 *p) +{ + u8 calcm, calcp; + + if (parent_rate < *freq) + *freq = parent_rate; + + parent_rate = (parent_rate + (*freq - 1)) / *freq; + + /* Invalid rate! 
*/ + if (parent_rate > 32) + return; + + if (parent_rate <= 4) + calcp = 0; + else if (parent_rate <= 8) + calcp = 1; + else if (parent_rate <= 16) + calcp = 2; + else + calcp = 3; + + calcm = (parent_rate >> calcp) - 1; + + *freq = (parent_rate >> calcp) / (calcm + 1); + + /* we were called to round the frequency, we can now return */ + if (n == NULL) + return; + + *m = calcm; + *p = calcp; +} + + + +/** + * sunxi_factors_clk_setup() - Setup function for factor clocks + */ + +struct factors_data { + struct clk_factors_config *table; + void (*getter) (u32 *rate, u32 parent_rate, u8 *n, u8 *k, u8 *m, u8 *p); +}; + +static struct clk_factors_config pll1_config = { + .nshift = 8, + .nwidth = 5, + .kshift = 4, + .kwidth = 2, + .mshift = 0, + .mwidth = 2, + .pshift = 16, + .pwidth = 2, +}; + +static struct clk_factors_config apb1_config = { + .mshift = 0, + .mwidth = 5, + .pshift = 16, + .pwidth = 2, +}; + +static const __initconst struct factors_data pll1_data = { + .table = &pll1_config, + .getter = sunxi_get_pll1_factors, +}; + +static const __initconst struct factors_data apb1_data = { + .table = &apb1_config, + .getter = sunxi_get_apb1_factors, +}; + +static void __init sunxi_factors_clk_setup(struct device_node *node, + struct factors_data *data) +{ + struct clk *clk; + const char *clk_name = node->name; + const char *parent; + void *reg; + + reg = of_iomap(node, 0); + + parent = of_clk_get_parent_name(node, 0); + + clk = clk_register_factors(NULL, clk_name, parent, CLK_IGNORE_UNUSED, + reg, data->table, data->getter, &clk_lock); + + if (clk) { + of_clk_add_provider(node, of_clk_src_simple_get, clk); + clk_register_clkdev(clk, clk_name, NULL); + } +} + + + +/** + * sunxi_mux_clk_setup() - Setup function for muxes + */ + +#define SUNXI_MUX_GATE_WIDTH 2 + +struct mux_data { + u8 shift; +}; + +static const __initconst struct mux_data cpu_data = { + .shift = 16, +}; + +static const __initconst struct mux_data apb1_mux_data = { + .shift = 24, +}; + +static void __init sunxi_mux_clk_setup(struct device_node *node, + struct mux_data *data) +{ + struct clk *clk; + const char *clk_name = node->name; + const char **parents = kmalloc(sizeof(char *) * 5, GFP_KERNEL); + void *reg; + int i = 0; + + reg = of_iomap(node, 0); + + while (i < 5 && (parents[i] = of_clk_get_parent_name(node, i)) != NULL) + i++; + + clk = clk_register_mux(NULL, clk_name, parents, i, 0, reg, + data->shift, SUNXI_MUX_GATE_WIDTH, + 0, &clk_lock); + + if (clk) { + of_clk_add_provider(node, of_clk_src_simple_get, clk); + clk_register_clkdev(clk, clk_name, NULL); + } +} + + + +/** + * sunxi_divider_clk_setup() - Setup function for simple divider clocks + */ + +#define SUNXI_DIVISOR_WIDTH 2 + +struct div_data { + u8 shift; + u8 pow; +}; + +static const __initconst struct div_data axi_data = { + .shift = 0, + .pow = 0, +}; + +static const __initconst struct div_data ahb_data = { + .shift = 4, + .pow = 1, +}; + +static const __initconst struct div_data apb0_data = { + .shift = 8, + .pow = 1, +}; + +static void __init sunxi_divider_clk_setup(struct device_node *node, + struct div_data *data) +{ + struct clk *clk; + const char *clk_name = node->name; + const char *clk_parent; + void *reg; + + reg = of_iomap(node, 0); + + clk_parent = of_clk_get_parent_name(node, 0); + + clk = clk_register_divider(NULL, clk_name, clk_parent, 0, + reg, data->shift, SUNXI_DIVISOR_WIDTH, + data->pow ? 
CLK_DIVIDER_POWER_OF_TWO : 0, + &clk_lock); + if (clk) { + of_clk_add_provider(node, of_clk_src_simple_get, clk); + clk_register_clkdev(clk, clk_name, NULL); + } +} + + +/* Matches for of_clk_init */ +static const __initconst struct of_device_id clk_match[] = { + {.compatible = "fixed-clock", .data = of_fixed_clk_setup,}, + {.compatible = "allwinner,sunxi-osc-clk", .data = sunxi_osc_clk_setup,}, + {} +}; + +/* Matches for factors clocks */ +static const __initconst struct of_device_id clk_factors_match[] = { + {.compatible = "allwinner,sunxi-pll1-clk", .data = &pll1_data,}, + {.compatible = "allwinner,sunxi-apb1-clk", .data = &apb1_data,}, + {} +}; + +/* Matches for divider clocks */ +static const __initconst struct of_device_id clk_div_match[] = { + {.compatible = "allwinner,sunxi-axi-clk", .data = &axi_data,}, + {.compatible = "allwinner,sunxi-ahb-clk", .data = &ahb_data,}, + {.compatible = "allwinner,sunxi-apb0-clk", .data = &apb0_data,}, + {} +}; + +/* Matches for mux clocks */ +static const __initconst struct of_device_id clk_mux_match[] = { + {.compatible = "allwinner,sunxi-cpu-clk", .data = &cpu_data,}, + {.compatible = "allwinner,sunxi-apb1-mux-clk", .data = &apb1_mux_data,}, + {} +}; + +static void __init of_sunxi_table_clock_setup(const struct of_device_id *clk_match, + void *function) +{ + struct device_node *np; + const struct div_data *data; + const struct of_device_id *match; + void (*setup_function)(struct device_node *, const void *) = function; + + for_each_matching_node(np, clk_match) { + match = of_match_node(clk_match, np); + data = match->data; + setup_function(np, data); + } +} + +void __init sunxi_init_clocks(void) +{ + /* Register all the simple sunxi clocks on DT */ + of_clk_init(clk_match); + + /* Register factor clocks */ + of_sunxi_table_clock_setup(clk_factors_match, sunxi_factors_clk_setup); + + /* Register divider clocks */ + of_sunxi_table_clock_setup(clk_div_match, sunxi_divider_clk_setup); + + /* Register mux clocks */ + of_sunxi_table_clock_setup(clk_mux_match, sunxi_mux_clk_setup); +} diff --git a/drivers/clocksource/sunxi_timer.c b/drivers/clocksource/sunxi_timer.c index 4086b9167159..0ce85e29769b 100644 --- a/drivers/clocksource/sunxi_timer.c +++ b/drivers/clocksource/sunxi_timer.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #define TIMER_CTL_REG 0x00 #define TIMER_CTL_ENABLE (1 << 0) @@ -123,7 +123,7 @@ void __init sunxi_timer_init(void) if (irq <= 0) panic("Can't parse IRQ"); - of_clk_init(NULL); + sunxi_init_clocks(); clk = of_clk_get(node, 0); if (IS_ERR(clk)) diff --git a/include/linux/clk/sunxi.h b/include/linux/clk/sunxi.h new file mode 100644 index 000000000000..e074fdd5a236 --- /dev/null +++ b/include/linux/clk/sunxi.h @@ -0,0 +1,22 @@ +/* + * Copyright 2012 Maxime Ripard + * + * Maxime Ripard + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __LINUX_CLK_SUNXI_H_ +#define __LINUX_CLK_SUNXI_H_ + +void __init sunxi_init_clocks(void); + +#endif -- cgit From 2db76d7c3c6db93058f983c8240f7c7c25e87ee6 Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Tue, 26 Mar 2013 15:14:18 +0200 Subject: lib/scatterlist: sg_page_iter: support sg lists w/o backing pages The i915 driver uses sg lists for memory without backing 'struct page' pages, similarly to other IO memory regions, setting only the DMA address for these. It does this, so that it can program the HW MMU tables in a uniform way both for sg lists with and without backing pages. Without a valid page pointer we can't call nth_page to get the current page in __sg_page_iter_next, so add a helper that relevant users can call separately. Also add a helper to get the DMA address of the current page (idea from Daniel). Convert all places in i915, to use the new API. Signed-off-by: Imre Deak Reviewed-by: Damien Lespiau Signed-off-by: Daniel Vetter --- drivers/gpu/drm/drm_cache.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 2 +- drivers/gpu/drm/i915/i915_gem.c | 8 ++++---- drivers/gpu/drm/i915/i915_gem_dmabuf.c | 2 +- drivers/gpu/drm/i915/i915_gem_gtt.c | 6 ++---- drivers/gpu/drm/i915/i915_gem_tiling.c | 4 ++-- include/linux/scatterlist.h | 28 +++++++++++++++++++++++----- lib/scatterlist.c | 4 +--- 8 files changed, 35 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c index bc8edbeca3fd..bb8f58012189 100644 --- a/drivers/gpu/drm/drm_cache.c +++ b/drivers/gpu/drm/drm_cache.c @@ -109,7 +109,7 @@ drm_clflush_sg(struct sg_table *st) mb(); for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) - drm_clflush_page(sg_iter.page); + drm_clflush_page(sg_page_iter_page(&sg_iter)); mb(); return; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1d091ea12fad..f69538508d8c 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1543,7 +1543,7 @@ static inline struct page *i915_gem_object_get_page(struct drm_i915_gem_object * struct sg_page_iter sg_iter; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, n) - return sg_iter.page; + return sg_page_iter_page(&sg_iter); return NULL; } diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index a1123a32dc27..911bd40ef513 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -442,7 +442,7 @@ i915_gem_shmem_pread(struct drm_device *dev, for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); if (remain <= 0) break; @@ -765,7 +765,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev, for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, offset >> PAGE_SHIFT) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); int partial_cacheline_write; if (remain <= 0) @@ -1647,7 +1647,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj) obj->dirty = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); if (obj->dirty) set_page_dirty(page); @@ -1827,7 +1827,7 @@ i915_gem_object_get_pages_gtt(struct drm_i915_gem_object *obj) err_pages: sg_mark_end(sg); for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) - page_cache_release(sg_iter.page); + page_cache_release(sg_page_iter_page(&sg_iter)); 
sg_free_table(st); kfree(st); return PTR_ERR(page); diff --git a/drivers/gpu/drm/i915/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/i915_gem_dmabuf.c index 898615d2d5e2..c6dfc1466e3a 100644 --- a/drivers/gpu/drm/i915/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/i915_gem_dmabuf.c @@ -130,7 +130,7 @@ static void *i915_gem_dmabuf_vmap(struct dma_buf *dma_buf) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0); - pages[i++] = sg_iter.page; + pages[i++] = sg_page_iter_page(&sg_iter); obj->dma_buf_vmapping = vmap(pages, i, 0, PAGE_KERNEL); drm_free_large(pages); diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index 4cbae7bbb833..24a23b31b55f 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -123,8 +123,7 @@ static void gen6_ppgtt_insert_entries(struct i915_hw_ppgtt *ppgtt, for_each_sg_page(pages->sgl, &sg_iter, pages->nents, 0) { dma_addr_t page_addr; - page_addr = sg_dma_address(sg_iter.sg) + - (sg_iter.sg_pgoffset << PAGE_SHIFT); + page_addr = sg_page_iter_dma_address(&sg_iter); pt_vaddr[act_pte] = gen6_pte_encode(ppgtt->dev, page_addr, cache_level); if (++act_pte == I915_PPGTT_PT_ENTRIES) { @@ -424,8 +423,7 @@ static void gen6_ggtt_insert_entries(struct drm_device *dev, dma_addr_t addr; for_each_sg_page(st->sgl, &sg_iter, st->nents, 0) { - addr = sg_dma_address(sg_iter.sg) + - (sg_iter.sg_pgoffset << PAGE_SHIFT); + addr = sg_page_iter_dma_address(&sg_iter); iowrite32(gen6_pte_encode(dev, addr, level), >t_entries[i]); i++; } diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c index f799708bcb85..c807eb93755b 100644 --- a/drivers/gpu/drm/i915/i915_gem_tiling.c +++ b/drivers/gpu/drm/i915/i915_gem_tiling.c @@ -481,7 +481,7 @@ i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - struct page *page = sg_iter.page; + struct page *page = sg_page_iter_page(&sg_iter); char new_bit_17 = page_to_phys(page) >> 17; if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) { @@ -511,7 +511,7 @@ i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj) i = 0; for_each_sg_page(obj->pages->sgl, &sg_iter, obj->pages->nents, 0) { - if (page_to_phys(sg_iter.page) & (1 << 17)) + if (page_to_phys(sg_page_iter_page(&sg_iter)) & (1 << 17)) __set_bit(i, obj->bit_17); else __clear_bit(i, obj->bit_17); diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 2d8bdaef9611..e96b9546c4c6 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -235,13 +235,13 @@ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, * sg page iterator * * Iterates over sg entries page-by-page. On each successful iteration, - * @piter->page points to the current page, @piter->sg to the sg holding this - * page and @piter->sg_pgoffset to the page's page offset within the sg. The - * iteration will stop either when a maximum number of sg entries was reached - * or a terminating sg (sg_last(sg) == true) was reached. + * you can call sg_page_iter_page(@piter) and sg_page_iter_dma_address(@piter) + * to get the current page and its dma address. @piter->sg will point to the + * sg holding this page and @piter->sg_pgoffset to the page's page offset + * within the sg. The iteration will stop either when a maximum number of sg + * entries was reached or a terminating sg (sg_last(sg) == true) was reached. 
*/ struct sg_page_iter { - struct page *page; /* current page */ struct scatterlist *sg; /* sg holding the page */ unsigned int sg_pgoffset; /* page offset within the sg */ @@ -255,6 +255,24 @@ bool __sg_page_iter_next(struct sg_page_iter *piter); void __sg_page_iter_start(struct sg_page_iter *piter, struct scatterlist *sglist, unsigned int nents, unsigned long pgoffset); +/** + * sg_page_iter_page - get the current page held by the page iterator + * @piter: page iterator holding the page + */ +static inline struct page *sg_page_iter_page(struct sg_page_iter *piter) +{ + return nth_page(sg_page(piter->sg), piter->sg_pgoffset); +} + +/** + * sg_page_iter_dma_address - get the dma address of the current page held by + * the page iterator. + * @piter: page iterator holding the page + */ +static inline dma_addr_t sg_page_iter_dma_address(struct sg_page_iter *piter) +{ + return sg_dma_address(piter->sg) + (piter->sg_pgoffset << PAGE_SHIFT); +} /** * for_each_sg_page - iterate over the pages of the given sg list diff --git a/lib/scatterlist.c b/lib/scatterlist.c index b83c144d731f..a1cf8cae60e7 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -401,7 +401,6 @@ void __sg_page_iter_start(struct sg_page_iter *piter, piter->__pg_advance = 0; piter->__nents = nents; - piter->page = NULL; piter->sg = sglist; piter->sg_pgoffset = pgoffset; } @@ -426,7 +425,6 @@ bool __sg_page_iter_next(struct sg_page_iter *piter) if (!--piter->__nents || !piter->sg) return false; } - piter->page = nth_page(sg_page(piter->sg), piter->sg_pgoffset); return true; } @@ -496,7 +494,7 @@ bool sg_miter_next(struct sg_mapping_iter *miter) miter->__remaining = min_t(unsigned long, miter->__remaining, PAGE_SIZE - miter->__offset); } - miter->page = miter->piter.page; + miter->page = sg_page_iter_page(&miter->piter); miter->consumed = miter->length = miter->__remaining; if (miter->__flags & SG_MITER_ATOMIC) -- cgit From 5203cd28db6dc05c3618a602cf4cf81203d00257 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 26 Mar 2013 23:11:21 +0000 Subject: net: core: introduce skb_probe_transport_header() Sometimes, we need probe and set the transport header for packets (e.g from untrusted source). This patch introduces a new helper skb_probe_transport_header() which tries to probe and set the l4 header through skb_flow_dissect(), if not just set the transport header to the hint passed by caller. Cc: Eric Dumazet Signed-off-by: Jason Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/skbuff.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 497412165b1c..fa88b966cb8e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -32,6 +32,7 @@ #include #include #include +#include /* Don't change this without changing skb_csum_unnecessary! 
*/ #define CHECKSUM_NONE 0 @@ -1559,6 +1560,19 @@ static inline void skb_set_transport_header(struct sk_buff *skb, skb->transport_header += offset; } +static inline void skb_probe_transport_header(struct sk_buff *skb, + const int offset_hint) +{ + struct flow_keys keys; + + if (skb_transport_header_was_set(skb)) + return; + else if (skb_flow_dissect(skb, &keys)) + skb_set_transport_header(skb, keys.thoff); + else + skb_set_transport_header(skb, offset_hint); +} + static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; -- cgit From d6b688cf2f7ca3e168acc73597f4d7102ae663fa Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Wed, 27 Mar 2013 17:23:10 +0100 Subject: bcma: handle more devices in bcma_pmu_get_alp_clock() Add some more chip IDs to bcma_pmu_get_alp_clock() Signed-off-by: Hauke Mehrtens Signed-off-by: John W. Linville --- drivers/bcma/driver_chipcommon_pmu.c | 24 ++++++++++++++++++++---- include/linux/bcma/bcma_driver_chipcommon.h | 1 + 2 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/bcma/driver_chipcommon_pmu.c b/drivers/bcma/driver_chipcommon_pmu.c index 7e88fffaf3f5..edca73af3cc0 100644 --- a/drivers/bcma/driver_chipcommon_pmu.c +++ b/drivers/bcma/driver_chipcommon_pmu.c @@ -174,19 +174,35 @@ u32 bcma_pmu_get_alp_clock(struct bcma_drv_cc *cc) struct bcma_bus *bus = cc->core->bus; switch (bus->chipinfo.id) { + case BCMA_CHIP_ID_BCM4313: + case BCMA_CHIP_ID_BCM43224: + case BCMA_CHIP_ID_BCM43225: + case BCMA_CHIP_ID_BCM43227: + case BCMA_CHIP_ID_BCM43228: + case BCMA_CHIP_ID_BCM4331: + case BCMA_CHIP_ID_BCM43421: + case BCMA_CHIP_ID_BCM43428: + case BCMA_CHIP_ID_BCM43431: case BCMA_CHIP_ID_BCM4716: - case BCMA_CHIP_ID_BCM4748: case BCMA_CHIP_ID_BCM47162: - case BCMA_CHIP_ID_BCM4313: - case BCMA_CHIP_ID_BCM5357: + case BCMA_CHIP_ID_BCM4748: case BCMA_CHIP_ID_BCM4749: + case BCMA_CHIP_ID_BCM5357: case BCMA_CHIP_ID_BCM53572: + case BCMA_CHIP_ID_BCM6362: /* always 20Mhz */ return 20000 * 1000; - case BCMA_CHIP_ID_BCM5356: case BCMA_CHIP_ID_BCM4706: + case BCMA_CHIP_ID_BCM5356: /* always 25Mhz */ return 25000 * 1000; + case BCMA_CHIP_ID_BCM43460: + case BCMA_CHIP_ID_BCM4352: + case BCMA_CHIP_ID_BCM4360: + if (cc->status & BCMA_CC_CHIPST_4360_XTAL_40MZ) + return 40000 * 1000; + else + return 20000 * 1000; default: bcma_warn(bus, "No ALP clock specified for %04X device, pmu rev. %d, using default %d Hz\n", bus->chipinfo.id, cc->pmu.rev, BCMA_CC_PMU_ALP_CLOCK); diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 8390c474f69a..1db4c6de372e 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -104,6 +104,7 @@ #define BCMA_CC_CHIPST_4706_MIPS_BENDIAN BIT(3) /* 0: little, 1: big endian */ #define BCMA_CC_CHIPST_4706_PCIE1_DISABLE BIT(5) /* PCIE1 enable strap pin */ #define BCMA_CC_CHIPST_5357_NAND_BOOT BIT(4) /* NAND boot, valid for CC rev 38 and/or BCM5357 */ +#define BCMA_CC_CHIPST_4360_XTAL_40MZ 0x00000001 #define BCMA_CC_JCMD 0x0030 /* Rev >= 10 only */ #define BCMA_CC_JCMD_START 0x80000000 #define BCMA_CC_JCMD_BUSY 0x80000000 -- cgit From 6951618b4b0bb022429ab17d49f2fa3650f21cb4 Mon Sep 17 00:00:00 2001 From: Hauke Mehrtens Date: Wed, 27 Mar 2013 17:23:11 +0100 Subject: bcma: export bcma_chipco_get_alp_clock() This function will be used by brcmsmac. Signed-off-by: Hauke Mehrtens Signed-off-by: John W. 
Linville --- drivers/bcma/driver_chipcommon.c | 3 ++- include/linux/bcma/bcma_driver_chipcommon.h | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/drivers/bcma/driver_chipcommon.c b/drivers/bcma/driver_chipcommon.c index 28fa50ad87be..88db0cb7bf19 100644 --- a/drivers/bcma/driver_chipcommon.c +++ b/drivers/bcma/driver_chipcommon.c @@ -25,13 +25,14 @@ static inline u32 bcma_cc_write32_masked(struct bcma_drv_cc *cc, u16 offset, return value; } -static u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc) +u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc) { if (cc->capabilities & BCMA_CC_CAP_PMU) return bcma_pmu_get_alp_clock(cc); return 20000000; } +EXPORT_SYMBOL_GPL(bcma_chipco_get_alp_clock); static u32 bcma_chipco_watchdog_get_max_timer(struct bcma_drv_cc *cc) { diff --git a/include/linux/bcma/bcma_driver_chipcommon.h b/include/linux/bcma/bcma_driver_chipcommon.h index 1db4c6de372e..453fcc914683 100644 --- a/include/linux/bcma/bcma_driver_chipcommon.h +++ b/include/linux/bcma/bcma_driver_chipcommon.h @@ -608,6 +608,8 @@ void bcma_chipco_bcm4331_ext_pa_lines_ctl(struct bcma_drv_cc *cc, bool enable); extern u32 bcma_chipco_watchdog_timer_set(struct bcma_drv_cc *cc, u32 ticks); +extern u32 bcma_chipco_get_alp_clock(struct bcma_drv_cc *cc); + void bcma_chipco_irq_mask(struct bcma_drv_cc *cc, u32 mask, u32 value); u32 bcma_chipco_irq_status(struct bcma_drv_cc *cc, u32 mask); -- cgit From 43a5911b3dcec81add87d833cd8c7ddaaa205a47 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 21 Mar 2013 15:59:15 +0000 Subject: regulator: ab8500: Clean out SoC registers Clean out initialisation that is handled by SoC. Regulator settings for Vpll (partly), Vsmps1, Vsmps2, Vsmps3 (partly), Vrf1, Varm, Vape, Vbb, Vmod are cleaned out. They should not be touched by the kernel. We also update many of the initialisation values to be more in-line with the current development efforts of ST-Ericsson internal engineers. 
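The pattern of the cleanup above is worth spelling out: every reg_init entry pairs a register with a mask of the bits the driver is allowed to write, so shrinking a mask (0xff -> 0xe8 and so on) is what guarantees the kernel never touches the SoC/PRCMU-owned bits, regardless of what value platform data supplies. A minimal sketch of that masked-update idiom, with hypothetical read_reg()/write_reg() accessors standing in for the real abx500 register I/O (not code from this patch):

#include <linux/types.h>

/* Hypothetical accessors standing in for the abx500 register I/O. */
extern u8 read_reg(u8 bank, u8 addr);
extern void write_reg(u8 bank, u8 addr, u8 val);

/* Apply an init value while touching only the bits named in @mask. */
static void apply_reg_init(u8 bank, u8 addr, u8 mask, u8 value)
{
	u8 reg = read_reg(bank, addr);

	reg &= ~mask;		/* clear only the bits the driver owns */
	reg |= value & mask;	/* set requested bits; leave the rest alone */
	write_reg(bank, addr, reg);
}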
Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500-regulators.c | 28 ++------ drivers/regulator/ab8500.c | 98 +++++++-------------------- include/linux/regulator/ab8500.h | 12 ---- 3 files changed, 28 insertions(+), 110 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.c b/arch/arm/mach-ux500/board-mop500-regulators.c index c962d1d726bb..3d899c51bbac 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.c +++ b/arch/arm/mach-ux500/board-mop500-regulators.c @@ -139,10 +139,9 @@ static struct regulator_consumer_supply ab8500_vana_consumers[] = { static struct ab8500_regulator_reg_init ab8500_reg_init[] = { /* * VanaRequestCtrl = HP/LP depending on VxRequest - * VpllRequestCtrl = HP/LP depending on VxRequest * VextSupply1RequestCtrl = HP/LP depending on VxRequest */ - INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL2, 0xfc, 0x00), + INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL2, 0xf0, 0x00), /* * VextSupply2RequestCtrl = HP/LP depending on VxRequest * VextSupply3RequestCtrl = HP/LP depending on VxRequest @@ -156,16 +155,12 @@ static struct ab8500_regulator_reg_init ab8500_reg_init[] = { */ INIT_REGULATOR_REGISTER(AB8500_REGUREQUESTCTRL4, 0x07, 0x00), /* - * Vsmps1SysClkReq1HPValid = enabled - * Vsmps2SysClkReq1HPValid = enabled - * Vsmps3SysClkReq1HPValid = enabled * VanaSysClkReq1HPValid = disabled - * VpllSysClkReq1HPValid = enabled * Vaux1SysClkReq1HPValid = disabled * Vaux2SysClkReq1HPValid = disabled * Vaux3SysClkReq1HPValid = disabled */ - INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0xff, 0x17), + INIT_REGULATOR_REGISTER(AB8500_REGUSYSCLKREQ1HPVALID1, 0xe8, 0x00), /* * VextSupply1SysClkReq1HPValid = disabled * VextSupply2SysClkReq1HPValid = disabled @@ -252,17 +247,7 @@ static struct ab8500_regulator_reg_init ab8500_reg_init[] = { */ INIT_REGULATOR_REGISTER(AB8500_REGUCTRL1VAMIC, 0x03, 0x00), /* - * Vsmps1Regu = HW control - * Vsmps1SelCtrl = Vsmps1 voltage defined by Vsmsp1Sel2 - */ - INIT_REGULATOR_REGISTER(AB8500_VSMPS1REGU, 0x0f, 0x06), - /* - * Vsmps2Regu = HW control - * Vsmps2SelCtrl = Vsmps2 voltage defined by Vsmsp2Sel2 - */ - INIT_REGULATOR_REGISTER(AB8500_VSMPS2REGU, 0x0f, 0x06), - /* - * VPll = Hw controlled + * VPll = Hw controlled (NOTE! 
PRCMU bits) * VanaRegu = force off */ INIT_REGULATOR_REGISTER(AB8500_VPLLVANAREGU, 0x0f, 0x02), @@ -285,14 +270,9 @@ static struct ab8500_regulator_reg_init ab8500_reg_init[] = { */ INIT_REGULATOR_REGISTER(AB8500_VAUX12REGU, 0x0f, 0x01), /* - * Vrf1Regu = HW control * Vaux3Regu = force off */ - INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU, 0x0f, 0x08), - /* - * Vsmps1 = 1.15V - */ - INIT_REGULATOR_REGISTER(AB8500_VSMPS1SEL1, 0x3f, 0x24), + INIT_REGULATOR_REGISTER(AB8500_VRF1VAUX3REGU, 0x03, 0x00), /* * Vaux1Sel = 2.5 V */ diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index f1453a66a0fd..919d9fa9605e 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -613,19 +613,10 @@ struct ab8500_reg_init { static struct ab8500_reg_init ab8500_reg_init[] = { /* - * 0x03, VarmRequestCtrl - * 0x0c, VapeRequestCtrl - * 0x30, Vsmps1RequestCtrl - * 0xc0, Vsmps2RequestCtrl - */ - REG_INIT(AB8500_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), - /* - * 0x03, Vsmps3RequestCtrl - * 0x0c, VpllRequestCtrl * 0x30, VanaRequestCtrl * 0xc0, VextSupply1RequestCtrl */ - REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), + REG_INIT(AB8500_REGUREQUESTCTRL2, 0x03, 0x04, 0xf0), /* * 0x03, VextSupply2RequestCtrl * 0x0c, VextSupply3RequestCtrl @@ -639,91 +630,74 @@ static struct ab8500_reg_init ab8500_reg_init[] = { */ REG_INIT(AB8500_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), /* - * 0x01, Vsmps1SysClkReq1HPValid - * 0x02, Vsmps2SysClkReq1HPValid - * 0x04, Vsmps3SysClkReq1HPValid * 0x08, VanaSysClkReq1HPValid - * 0x10, VpllSysClkReq1HPValid * 0x20, Vaux1SysClkReq1HPValid * 0x40, Vaux2SysClkReq1HPValid * 0x80, Vaux3SysClkReq1HPValid */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xe8), /* - * 0x01, VapeSysClkReq1HPValid - * 0x02, VarmSysClkReq1HPValid - * 0x04, VbbSysClkReq1HPValid - * 0x08, VmodSysClkReq1HPValid * 0x10, VextSupply1SysClkReq1HPValid * 0x20, VextSupply2SysClkReq1HPValid * 0x40, VextSupply3SysClkReq1HPValid */ - REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x7f), + REG_INIT(AB8500_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x70), /* - * 0x01, Vsmps1HwHPReq1Valid - * 0x02, Vsmps2HwHPReq1Valid - * 0x04, Vsmps3HwHPReq1Valid * 0x08, VanaHwHPReq1Valid - * 0x10, VpllHwHPReq1Valid * 0x20, Vaux1HwHPReq1Valid * 0x40, Vaux2HwHPReq1Valid * 0x80, Vaux3HwHPReq1Valid */ - REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + REG_INIT(AB8500_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xe8), /* * 0x01, VextSupply1HwHPReq1Valid * 0x02, VextSupply2HwHPReq1Valid * 0x04, VextSupply3HwHPReq1Valid - * 0x08, VmodHwHPReq1Valid */ - REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x0f), + REG_INIT(AB8500_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), /* - * 0x01, Vsmps1HwHPReq2Valid - * 0x02, Vsmps2HwHPReq2Valid - * 0x03, Vsmps3HwHPReq2Valid * 0x08, VanaHwHPReq2Valid - * 0x10, VpllHwHPReq2Valid * 0x20, Vaux1HwHPReq2Valid * 0x40, Vaux2HwHPReq2Valid * 0x80, Vaux3HwHPReq2Valid */ - REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + REG_INIT(AB8500_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xe8), /* * 0x01, VextSupply1HwHPReq2Valid * 0x02, VextSupply2HwHPReq2Valid * 0x04, VextSupply3HwHPReq2Valid - * 0x08, VmodHwHPReq2Valid */ - REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x0f), + REG_INIT(AB8500_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), /* - * 0x01, VapeSwHPReqValid - * 0x02, VarmSwHPReqValid - * 0x04, Vsmps1SwHPReqValid - * 0x08, Vsmps2SwHPReqValid - * 0x10, Vsmps3SwHPReqValid * 0x20, VanaSwHPReqValid - * 0x40, VpllSwHPReqValid * 0x80, Vaux1SwHPReqValid */ 
- REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + REG_INIT(AB8500_REGUSWHPREQVALID1, 0x03, 0x0d, 0xa0), /* * 0x01, Vaux2SwHPReqValid * 0x02, Vaux3SwHPReqValid * 0x04, VextSupply1SwHPReqValid * 0x08, VextSupply2SwHPReqValid * 0x10, VextSupply3SwHPReqValid - * 0x20, VmodSwHPReqValid */ - REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x3f), + REG_INIT(AB8500_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), /* * 0x02, SysClkReq2Valid1 - * ... + * 0x04, SysClkReq3Valid1 + * 0x08, SysClkReq4Valid1 + * 0x10, SysClkReq5Valid1 + * 0x20, SysClkReq6Valid1 + * 0x40, SysClkReq7Valid1 * 0x80, SysClkReq8Valid1 */ REG_INIT(AB8500_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xfe), /* * 0x02, SysClkReq2Valid2 - * ... + * 0x04, SysClkReq3Valid2 + * 0x08, SysClkReq4Valid2 + * 0x10, SysClkReq5Valid2 + * 0x20, SysClkReq6Valid2 + * 0x40, SysClkReq7Valid2 * 0x80, SysClkReq8Valid2 */ REG_INIT(AB8500_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xfe), @@ -748,21 +722,7 @@ static struct ab8500_reg_init ab8500_reg_init[] = { */ REG_INIT(AB8500_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), /* - * 0x03, Vsmps1Regu - * 0x0c, Vsmps1SelCtrl - * 0x10, Vsmps1AutoMode - * 0x20, Vsmps1PWMMode - */ - REG_INIT(AB8500_VSMPS1REGU, 0x04, 0x03, 0x3f), - /* - * 0x03, Vsmps2Regu - * 0x0c, Vsmps2SelCtrl - * 0x10, Vsmps2AutoMode - * 0x20, Vsmps2PWMMode - */ - REG_INIT(AB8500_VSMPS2REGU, 0x04, 0x04, 0x3f), - /* - * 0x03, VpllRegu + * 0x03, VpllRegu (NOTE! PRCMU register bits) * 0x0c, VanaRegu */ REG_INIT(AB8500_VPLLVANAREGU, 0x04, 0x06, 0x0f), @@ -785,14 +745,9 @@ static struct ab8500_reg_init ab8500_reg_init[] = { */ REG_INIT(AB8500_VAUX12REGU, 0x04, 0x09, 0x0f), /* - * 0x0c, Vrf1Regu * 0x03, Vaux3Regu */ - REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), - /* - * 0x3f, Vsmps1Sel1 - */ - REG_INIT(AB8500_VSMPS1SEL1, 0x04, 0x13, 0x3f), + REG_INIT(AB8500_VRF1VAUX3REGU, 0x04, 0x0a, 0x03), /* * 0x0f, Vaux1Sel */ @@ -803,16 +758,13 @@ static struct ab8500_reg_init ab8500_reg_init[] = { REG_INIT(AB8500_VAUX2SEL, 0x04, 0x20, 0x0f), /* * 0x07, Vaux3Sel - * 0x30, Vrf1Sel */ - REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x37), + REG_INIT(AB8500_VRF1VAUX3SEL, 0x04, 0x21, 0x07), /* * 0x01, VextSupply12LP */ REG_INIT(AB8500_REGUCTRL2SPARE, 0x04, 0x22, 0x01), /* - * 0x01, VpllDisch - * 0x02, Vrf1Disch * 0x04, Vaux1Disch * 0x08, Vaux2Disch * 0x10, Vaux3Disch @@ -820,15 +772,13 @@ static struct ab8500_reg_init ab8500_reg_init[] = { * 0x40, VTVoutDisch * 0x80, VaudioDisch */ - REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xff), + REG_INIT(AB8500_REGUCTRLDISCH, 0x04, 0x43, 0xfc), /* - * 0x01, VsimDisch * 0x02, VanaDisch * 0x04, VdmicPullDownEna - * 0x08, VpllPullDownEna * 0x10, VdmicDisch */ - REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x1f), + REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), }; static int ab8500_regulator_init_registers(struct platform_device *pdev, diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 3a8e02687f7b..26792ff360be 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -61,7 +61,6 @@ struct ab8500_regulator_reg_init { /* AB8500 registers */ enum ab8500_regulator_reg { - AB8500_REGUREQUESTCTRL1, AB8500_REGUREQUESTCTRL2, AB8500_REGUREQUESTCTRL3, AB8500_REGUREQUESTCTRL4, @@ -78,22 +77,11 @@ enum ab8500_regulator_reg { AB8500_REGUMISC1, AB8500_VAUDIOSUPPLY, AB8500_REGUCTRL1VAMIC, - AB8500_VSMPS1REGU, - AB8500_VSMPS2REGU, - AB8500_VSMPS3REGU, /* NOTE! 
PRCMU register */ AB8500_VPLLVANAREGU, AB8500_VREFDDR, AB8500_EXTSUPPLYREGU, AB8500_VAUX12REGU, AB8500_VRF1VAUX3REGU, - AB8500_VSMPS1SEL1, - AB8500_VSMPS1SEL2, - AB8500_VSMPS1SEL3, - AB8500_VSMPS2SEL1, - AB8500_VSMPS2SEL2, - AB8500_VSMPS2SEL3, - AB8500_VSMPS3SEL1, /* NOTE! PRCMU register */ - AB8500_VSMPS3SEL2, /* NOTE! PRCMU register */ AB8500_VAUX1SEL, AB8500_VAUX2SEL, AB8500_VRF1VAUX3SEL, -- cgit From fbbdb8f096e0e5d8244e1ffa46e364146ab9a440 Mon Sep 17 00:00:00 2001 From: Ying Xue Date: Wed, 27 Mar 2013 16:46:06 +0000 Subject: net: fix compile error of implicit declaration of skb_probe_transport_header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commit 40893fd(net: switch to use skb_probe_transport_header()) involes a new error accidently. When NET_SKBUFF_DATA_USES_OFFSE is not enabled, below compile error happens: CC net/packet/af_packet.o net/packet/af_packet.c: In function ‘packet_sendmsg_spkt’: net/packet/af_packet.c:1516:2: error: implicit declaration of function ‘skb_probe_transport_header’ [-Werror=implicit-function-declaration] cc1: some warnings being treated as errors make[2]: *** [net/packet/af_packet.o] Error 1 make[1]: *** [net/packet] Error 2 make: *** [net] Error 2 As it seems skb_probe_transport_header() is not related to NET_SKBUFF_DATA_USES_OFFSE, we should move the definition of skb_probe_transport_header() out of scope of NET_SKBUFF_DATA_USES_OFFSE macro. Cc: Jason Wang Cc: Eric Dumazet Signed-off-by: Ying Xue Acked-by: Jason Wang Signed-off-by: David S. Miller --- include/linux/skbuff.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fa88b966cb8e..878e0ee81068 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1560,19 +1560,6 @@ static inline void skb_set_transport_header(struct sk_buff *skb, skb->transport_header += offset; } -static inline void skb_probe_transport_header(struct sk_buff *skb, - const int offset_hint) -{ - struct flow_keys keys; - - if (skb_transport_header_was_set(skb)) - return; - else if (skb_flow_dissect(skb, &keys)) - skb_set_transport_header(skb, keys.thoff); - else - skb_set_transport_header(skb, offset_hint); -} - static inline unsigned char *skb_network_header(const struct sk_buff *skb) { return skb->head + skb->network_header; @@ -1716,6 +1703,19 @@ static inline void skb_set_mac_header(struct sk_buff *skb, const int offset) } #endif /* NET_SKBUFF_DATA_USES_OFFSET */ +static inline void skb_probe_transport_header(struct sk_buff *skb, + const int offset_hint) +{ + struct flow_keys keys; + + if (skb_transport_header_was_set(skb)) + return; + else if (skb_flow_dissect(skb, &keys)) + skb_set_transport_header(skb, keys.thoff); + else + skb_set_transport_header(skb, offset_hint); +} + static inline void skb_mac_header_rebuild(struct sk_buff *skb) { if (skb_mac_header_was_set(skb)) { -- cgit From f3d4039242af92a9d93dee2fd9ae47066b20ca29 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Wed, 27 Mar 2013 10:52:28 +0000 Subject: tokenring: delete last holdout of CONFIG_TR Tokenring support was deleted in v3.5. One last holdout of the macro CONFIG_TR escaped that fate. Until now. Signed-off-by: Paul Bolle Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 56e3e0665272..1dbb02c98946 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -144,8 +144,6 @@ static inline bool dev_xmit_complete(int rc) # else # define LL_MAX_HEADER 96 # endif -#elif IS_ENABLED(CONFIG_TR) -# define LL_MAX_HEADER 48 #else # define LL_MAX_HEADER 32 #endif -- cgit From e5c5d22e8dcf7c2d430336cbf8e180bd38e8daf1 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 28 Mar 2013 13:38:25 +0900 Subject: net: add ETH_P_802_3_MIN Add a new constant ETH_P_802_3_MIN, the minimum ethernet type for an 802.3 frame. Frames with a lower value in the ethernet type field are Ethernet II. Also update all the users of this value that David Miller and I could find to use the new constant. Also correct a bug in util.c. The comparison with ETH_P_802_3_MIN should be >= not >. As suggested by Jesse Gross. Compile tested only. Cc: David Miller Cc: Jesse Gross Cc: Karsten Keil Cc: John W. Linville Cc: Johannes Berg Cc: Bart De Schuymer Cc: Stephen Hemminger Cc: Patrick McHardy Cc: Marcel Holtmann Cc: Gustavo Padovan Cc: Johan Hedberg Cc: linux-bluetooth@vger.kernel.org Cc: netfilter-devel@vger.kernel.org Cc: bridge@lists.linux-foundation.org Cc: linux-wireless@vger.kernel.org Cc: linux1394-devel@lists.sourceforge.net Cc: linux-media@vger.kernel.org Cc: netdev@vger.kernel.org Cc: dev@openvswitch.org Acked-by: Mauro Carvalho Chehab Acked-by: Stefan Richter Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/firewire/net.c | 2 +- drivers/isdn/i4l/isdn_net.c | 2 +- drivers/media/dvb-core/dvb_net.c | 10 +++++----- drivers/net/ethernet/sun/niu.c | 2 +- drivers/net/plip/plip.c | 2 +- drivers/net/wireless/ray_cs.c | 2 +- include/linux/if_vlan.h | 2 +- include/uapi/linux/if_ether.h | 3 +++ net/atm/lec.h | 2 +- net/bluetooth/bnep/netdev.c | 2 +- net/bridge/netfilter/ebtables.c | 2 +- net/ethernet/eth.c | 2 +- net/mac80211/tx.c | 2 +- net/openvswitch/datapath.c | 2 +- net/openvswitch/flow.c | 6 +++--- net/wireless/util.c | 2 +- 16 files changed, 24 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/firewire/net.c b/drivers/firewire/net.c index 56796330a162..4d565365e476 100644 --- a/drivers/firewire/net.c +++ b/drivers/firewire/net.c @@ -547,7 +547,7 @@ static int fwnet_finish_incoming_packet(struct net_device *net, if (memcmp(eth->h_dest, net->dev_addr, net->addr_len)) skb->pkt_type = PACKET_OTHERHOST; } - if (ntohs(eth->h_proto) >= 1536) { + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) { protocol = eth->h_proto; } else { rawp = (u16 *)skb->data; diff --git a/drivers/isdn/i4l/isdn_net.c b/drivers/isdn/i4l/isdn_net.c index babc621a07fb..88d657dff474 100644 --- a/drivers/isdn/i4l/isdn_net.c +++ b/drivers/isdn/i4l/isdn_net.c @@ -1385,7 +1385,7 @@ isdn_net_type_trans(struct sk_buff *skb, struct net_device *dev) if (memcmp(eth->h_dest, dev->dev_addr, ETH_ALEN)) skb->pkt_type = PACKET_OTHERHOST; } - if (ntohs(eth->h_proto) >= 1536) + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) return eth->h_proto; rawp = skb->data; diff --git a/drivers/media/dvb-core/dvb_net.c b/drivers/media/dvb-core/dvb_net.c index 44225b186f6d..83a23afb13ab 100644 --- a/drivers/media/dvb-core/dvb_net.c +++ b/drivers/media/dvb-core/dvb_net.c @@ -185,7 +185,7 @@ static __be16 dvb_net_eth_type_trans(struct sk_buff *skb, skb->pkt_type=PACKET_MULTICAST; } - if (ntohs(eth->h_proto) >= 1536) + if 
(ntohs(eth->h_proto) >= ETH_P_802_3_MIN) return eth->h_proto; rawp = skb->data; @@ -228,9 +228,9 @@ static int ule_test_sndu( struct dvb_net_priv *p ) static int ule_bridged_sndu( struct dvb_net_priv *p ) { struct ethhdr *hdr = (struct ethhdr*) p->ule_next_hdr; - if(ntohs(hdr->h_proto) < 1536) { + if(ntohs(hdr->h_proto) < ETH_P_802_3_MIN) { int framelen = p->ule_sndu_len - ((p->ule_next_hdr+sizeof(struct ethhdr)) - p->ule_skb->data); - /* A frame Type < 1536 for a bridged frame, introduces a LLC Length field. */ + /* A frame Type < ETH_P_802_3_MIN for a bridged frame, introduces a LLC Length field. */ if(framelen != ntohs(hdr->h_proto)) { return -1; } @@ -320,7 +320,7 @@ static int handle_ule_extensions( struct dvb_net_priv *p ) (int) p->ule_sndu_type, l, total_ext_len); #endif - } while (p->ule_sndu_type < 1536); + } while (p->ule_sndu_type < ETH_P_802_3_MIN); return total_ext_len; } @@ -712,7 +712,7 @@ static void dvb_net_ule( struct net_device *dev, const u8 *buf, size_t buf_len ) } /* Handle ULE Extension Headers. */ - if (priv->ule_sndu_type < 1536) { + if (priv->ule_sndu_type < ETH_P_802_3_MIN) { /* There is an extension header. Handle it accordingly. */ int l = handle_ule_extensions(priv); if (l < 0) { diff --git a/drivers/net/ethernet/sun/niu.c b/drivers/net/ethernet/sun/niu.c index e4c1c88e4c2a..95cff98d8a34 100644 --- a/drivers/net/ethernet/sun/niu.c +++ b/drivers/net/ethernet/sun/niu.c @@ -6618,7 +6618,7 @@ static u64 niu_compute_tx_flags(struct sk_buff *skb, struct ethhdr *ehdr, (len << TXHDR_LEN_SHIFT) | ((l3off / 2) << TXHDR_L3START_SHIFT) | (ihl << TXHDR_IHL_SHIFT) | - ((eth_proto_inner < 1536) ? TXHDR_LLC : 0) | + ((eth_proto_inner < ETH_P_802_3_MIN) ? TXHDR_LLC : 0) | ((eth_proto == ETH_P_8021Q) ? TXHDR_VLAN : 0) | (ipv6 ? TXHDR_IP_VER : 0) | csum_bits); diff --git a/drivers/net/plip/plip.c b/drivers/net/plip/plip.c index bed62d9c53c8..1f7bef90b467 100644 --- a/drivers/net/plip/plip.c +++ b/drivers/net/plip/plip.c @@ -560,7 +560,7 @@ static __be16 plip_type_trans(struct sk_buff *skb, struct net_device *dev) * so don't forget to remove it. 
*/ - if (ntohs(eth->h_proto) >= 1536) + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) return eth->h_proto; rawp = skb->data; diff --git a/drivers/net/wireless/ray_cs.c b/drivers/net/wireless/ray_cs.c index 4775b5d172d5..ebada812b3a5 100644 --- a/drivers/net/wireless/ray_cs.c +++ b/drivers/net/wireless/ray_cs.c @@ -953,7 +953,7 @@ static int translate_frame(ray_dev_t *local, struct tx_msg __iomem *ptx, unsigned char *data, int len) { __be16 proto = ((struct ethhdr *)data)->h_proto; - if (ntohs(proto) >= 1536) { /* DIX II ethernet frame */ + if (ntohs(proto) >= ETH_P_802_3_MIN) { /* DIX II ethernet frame */ pr_debug("ray_cs translate_frame DIX II\n"); /* Copy LLC header to card buffer */ memcpy_toio(&ptx->var, eth2_llc, sizeof(eth2_llc)); diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h index 218a3b686d90..70962f3fdb79 100644 --- a/include/linux/if_vlan.h +++ b/include/linux/if_vlan.h @@ -339,7 +339,7 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb, */ proto = vhdr->h_vlan_encapsulated_proto; - if (ntohs(proto) >= 1536) { + if (ntohs(proto) >= ETH_P_802_3_MIN) { skb->protocol = proto; return; } diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 798032d01112..ade07f1c491a 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -94,6 +94,9 @@ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is less than this value + * then the frame is Ethernet II. Else it is 802.3 */ + /* * Non DIX types. Won't clash for 1500 types. */ diff --git a/net/atm/lec.h b/net/atm/lec.h index a86aff9a3c04..4149db1b7885 100644 --- a/net/atm/lec.h +++ b/net/atm/lec.h @@ -58,7 +58,7 @@ struct lane2_ops { * field in h_type field. Data follows immediately after header. * 2. LLC Data frames whose total length, including LLC field and data, * but not padding required to meet the minimum data frame length, - * is less than 1536(0x0600) MUST be encoded by placing that length + * is less than ETH_P_802_3_MIN MUST be encoded by placing that length * in the h_type field. The LLC field follows header immediately. * 3. LLC data frames longer than this maximum MUST be encoded by placing * the value 0 in the h_type field. 
diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c index e58c8b32589c..4b488ec26105 100644 --- a/net/bluetooth/bnep/netdev.c +++ b/net/bluetooth/bnep/netdev.c @@ -136,7 +136,7 @@ static u16 bnep_net_eth_proto(struct sk_buff *skb) struct ethhdr *eh = (void *) skb->data; u16 proto = ntohs(eh->h_proto); - if (proto >= 1536) + if (proto >= ETH_P_802_3_MIN) return proto; if (get_unaligned((__be16 *) skb->data) == htons(0xFFFF)) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 8d493c91a562..3d110c4fc787 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -138,7 +138,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, ethproto = h->h_proto; if (e->bitmask & EBT_802_3) { - if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO)) + if (FWINV2(ntohs(ethproto) >= ETH_P_802_3_MIN, EBT_IPROTO)) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && FWINV2(e->ethproto != ethproto, EBT_IPROTO)) diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index a36c85eab5b4..5359560926bc 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -195,7 +195,7 @@ __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev) if (netdev_uses_trailer_tags(dev)) return htons(ETH_P_TRAILER); - if (ntohs(eth->h_proto) >= 1536) + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) return eth->h_proto; /* diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 8914d2d2881a..4e8a86163fc7 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2085,7 +2085,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; - } else if (ethertype >= 0x600) { + } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index d61cd9971808..8759265a3e46 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -681,7 +681,7 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info) /* Normally, setting the skb 'protocol' field would be handled by a * call to eth_type_trans(), but it assumes there's a sending * device, which we may not have. 
*/ - if (ntohs(eth->h_proto) >= 1536) + if (ntohs(eth->h_proto) >= ETH_P_802_3_MIN) packet->protocol = eth->h_proto; else packet->protocol = htons(ETH_P_802_2); diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index fe0e4215c73d..332486839347 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -466,7 +466,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) proto = *(__be16 *) skb->data; __skb_pull(skb, sizeof(__be16)); - if (ntohs(proto) >= 1536) + if (ntohs(proto) >= ETH_P_802_3_MIN) return proto; if (skb->len < sizeof(struct llc_snap_hdr)) @@ -483,7 +483,7 @@ static __be16 parse_ethertype(struct sk_buff *skb) __skb_pull(skb, sizeof(struct llc_snap_hdr)); - if (ntohs(llc->ethertype) >= 1536) + if (ntohs(llc->ethertype) >= ETH_P_802_3_MIN) return llc->ethertype; return htons(ETH_P_802_2); @@ -1038,7 +1038,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp, if (attrs & (1 << OVS_KEY_ATTR_ETHERTYPE)) { swkey->eth.type = nla_get_be16(a[OVS_KEY_ATTR_ETHERTYPE]); - if (ntohs(swkey->eth.type) < 1536) + if (ntohs(swkey->eth.type) < ETH_P_802_3_MIN) return -EINVAL; attrs &= ~(1 << OVS_KEY_ATTR_ETHERTYPE); } else { diff --git a/net/wireless/util.c b/net/wireless/util.c index 37a56ee1e1ed..6cbac99ae03d 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -511,7 +511,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, encaps_data = bridge_tunnel_header; encaps_len = sizeof(bridge_tunnel_header); skip_header_bytes -= 2; - } else if (ethertype > 0x600) { + } else if (ethertype >= ETH_P_802_3_MIN) { encaps_data = rfc1042_header; encaps_len = sizeof(rfc1042_header); skip_header_bytes -= 2; -- cgit From 3d5a96582303e28c48699f3faaf920ef7d43e6f2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 19 Mar 2013 15:38:50 +0100 Subject: clocksource: make CLOCKSOURCE_OF_DECLARE type safe This ensures that a function pointer passed into CLOCKSOURCE_OF_DECLARE takes the same arguments that we use for calling that function later. Also fix the extraneous semicolon at end of the CLOCKSOURCE_OF_DECLARE definition. 
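The type safety comes from a standard C idiom: comparing a function pointer against NULL cast to the expected pointer type, as in the .data = (fn == (clocksource_of_init_fn)NULL) ? fn : fn initializer above, makes the compiler check that fn has the declared signature, while the ?: expression still evaluates to fn itself, so nothing changes at run time. A self-contained illustration of the idiom (all names below are invented for the example):

#include <stddef.h>

struct device_node;
typedef void (*init_fn)(struct device_node *);

#define CHECKED_FN(fn) ((fn) == (init_fn)NULL ? (fn) : (fn))

static void good_init(struct device_node *np) { (void)np; }
static void bad_init(int *p) { (void)p; }

int main(void)
{
	init_fn f = CHECKED_FN(good_init);	/* accepted: signature matches */
	/*
	 * init_fn g = CHECKED_FN(bad_init);
	 * rejected: "comparison of distinct pointer types", which the
	 * compiler diagnoses (an error in a -Werror build).
	 */
	(void)f;
	(void)bad_init;
	return 0;
}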
Signed-off-by: Arnd Bergmann Acked-by: Rob Herring --- drivers/clocksource/clksrc-of.c | 3 ++- drivers/clocksource/vt8500_timer.c | 2 +- include/linux/clocksource.h | 11 +++++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/clocksource/clksrc-of.c b/drivers/clocksource/clksrc-of.c index 3ef11fba781c..37f5325bec95 100644 --- a/drivers/clocksource/clksrc-of.c +++ b/drivers/clocksource/clksrc-of.c @@ -16,6 +16,7 @@ #include #include +#include extern struct of_device_id __clksrc_of_table[]; @@ -26,7 +27,7 @@ void __init clocksource_of_init(void) { struct device_node *np; const struct of_device_id *match; - void (*init_func)(struct device_node *); + clocksource_of_init_fn init_func; for_each_matching_node_and_match(np, __clksrc_of_table, &match) { init_func = match->data; diff --git a/drivers/clocksource/vt8500_timer.c b/drivers/clocksource/vt8500_timer.c index 242255285597..64f553f04fa4 100644 --- a/drivers/clocksource/vt8500_timer.c +++ b/drivers/clocksource/vt8500_timer.c @@ -165,4 +165,4 @@ static void __init vt8500_timer_init(struct device_node *np) 4, 0xf0000000); } -CLOCKSOURCE_OF_DECLARE(vt8500, "via,vt8500-timer", vt8500_timer_init) +CLOCKSOURCE_OF_DECLARE(vt8500, "via,vt8500-timer", vt8500_timer_init); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 08ed5e19d8c6..192d6d1771ee 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -332,16 +332,23 @@ extern int clocksource_mmio_init(void __iomem *, const char *, extern int clocksource_i8253_init(void); +struct device_node; +typedef void(*clocksource_of_init_fn)(struct device_node *); #ifdef CONFIG_CLKSRC_OF extern void clocksource_of_init(void); #define CLOCKSOURCE_OF_DECLARE(name, compat, fn) \ static const struct of_device_id __clksrc_of_table_##name \ __used __section(__clksrc_of_table) \ - = { .compatible = compat, .data = fn }; + = { .compatible = compat, \ + .data = (fn == (clocksource_of_init_fn)NULL) ? fn : fn } #else static inline void clocksource_of_init(void) {} -#define CLOCKSOURCE_OF_DECLARE(name, compat, fn) +#define CLOCKSOURCE_OF_DECLARE(name, compat, fn) \ + static const struct of_device_id __clksrc_of_table_##name \ + __attribute__((unused)) \ + = { .compatible = compat, \ + .data = (fn == (clocksource_of_init_fn)NULL) ? fn : fn } #endif #endif /* _LINUX_CLOCKSOURCE_H */ -- cgit From b949be5857a4033e00fed67b707774f52619ce60 Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 27 Mar 2013 14:08:33 +0100 Subject: idr: document exit conditions on idr_for_each_entry better And some manual common subexpression elimination which may help the compiler produce smaller code. Signed-off-by: George Spelvin Signed-off-by: Philipp Reisner Signed-off-by: Jens Axboe --- include/linux/idr.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/idr.h b/include/linux/idr.h index 2640c7e99e51..6ece0583362a 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -122,11 +122,13 @@ static inline void *idr_find(struct idr *idr, int id) * @idp: idr handle * @entry: the type * to use as cursor * @id: id entry's key + * + * @entry and @id do not need to be initialized before the loop, and + * after normal terminatinon @entry is left with the value NULL. This + * is convenient for a "not found" value. 
*/ -#define idr_for_each_entry(idp, entry, id) \ - for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \ - entry != NULL; \ - ++id, entry = (typeof(entry))idr_get_next((idp), &(id))) +#define idr_for_each_entry(idp, entry, id) \ + for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) /* * Don't use the following functions. These exist only to suppress -- cgit From 2bd5ed5d6713594eb2b4d234d01217d506279c7d Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:40 +0100 Subject: drbd: Fix disconnect to keep the peer disk state if connection breaks during operation The issue was that if the connection broke while we did the graceful state change to C_DISCONNECTING (C_TEARDOWN), we returned a success code (SS_CW_NO_NEED) from the state engine. As a result, we failed to call the fence-peer script in such a case. Fixed that by introducing a new error code (SS_OUTDATE_WO_CONN). This one should never reach back into user space. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_nl.c | 7 +++++-- drivers/block/drbd/drbd_state.c | 14 +++++++------- drivers/block/drbd/drbd_strings.c | 1 + include/linux/drbd.h | 3 ++- 4 files changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 56bafdcd943e..39e9a91a8f31 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2198,8 +2198,11 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool for return SS_SUCCESS; case SS_PRIMARY_NOP: /* Our state checking code wants to see the peer outdated. */ - rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, - pdsk, D_OUTDATED), CS_VERBOSE); + rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0); + + if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */ + rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_VERBOSE); + break; case SS_CW_FAILED_BY_PEER: /* The peer probably wants to see us outdated. */ diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 22e259f34370..90c5be2b1d30 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -642,6 +642,10 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_t && os.conn < C_WF_REPORT_PARAMS) rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */ + if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED && + os.conn < C_CONNECTED && os.pdsk > D_OUTDATED) + rv = SS_OUTDATE_WO_CONN; + return rv; } @@ -1748,13 +1752,9 @@ _conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags)) return SS_CW_FAILED_BY_PEER; - rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR; - - if (rv == SS_UNKNOWN_ERROR) - rv = conn_is_valid_transition(tconn, mask, val, 0); - - if (rv == SS_SUCCESS) - rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail.
*/ + rv = conn_is_valid_transition(tconn, mask, val, 0); + if (rv == SS_SUCCESS && tconn->cstate == C_WF_REPORT_PARAMS) + rv = SS_UNKNOWN_ERROR; /* continue waiting */ return rv; } diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 9a664bd27404..58e08ff2b2ce 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -89,6 +89,7 @@ static const char *drbd_state_sw_errors[] = { [-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated", [-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change", [-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted", + [-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer", [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config", }; diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 0c5a18ec322c..316330705fd7 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -319,7 +319,8 @@ enum drbd_state_rv { SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */ SS_O_VOL_PEER_PRI = -20, - SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */ + SS_OUTDATE_WO_CONN = -21, + SS_AFTER_LAST_ERROR = -22, /* Keep this at bottom */ }; /* from drbd_strings.c */ -- cgit From 3990e04df085e0561ab34f84731dc5929585c526 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 27 Mar 2013 14:08:48 +0100 Subject: drbd: use sched_setscheduler() It went unnoticed for some time that assigning to current->policy is no longer sufficient to set a real-time priority for a kernel thread. Reported-by: Charlie Suffin Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_receiver.c | 6 ++++-- include/linux/drbd.h | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index a75c0b134856..0f449bbf0edf 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -5257,9 +5257,11 @@ int drbd_asender(struct drbd_thread *thi) bool ping_timeout_active = false; struct net_conf *nc; int ping_timeo, tcp_cork, ping_int; + struct sched_param param = { .sched_priority = 2 }; - current->policy = SCHED_RR; /* Make this a realtime task! */ - current->rt_priority = 2; /* more important than all other tasks */ + rv = sched_setscheduler(current, SCHED_RR, &param); + if (rv < 0) + conn_err(tconn, "drbd_asender: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 316330705fd7..1b4d4ee1168f 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -52,7 +52,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.2" +#define REL_VERSION "8.4.3" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 -- cgit From 84ebc10294a3d7be4c66f51070b7aedbaa24de9b Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 27 Mar 2013 16:14:46 -0400 Subject: USB: remove CONFIG_USB_SUSPEND option This patch (as1675) removes the CONFIG_USB_SUSPEND option, essentially replacing it everywhere with CONFIG_PM_RUNTIME (except for one place in hub.c, where it is replaced with CONFIG_PM because the code needs to be used in both runtime and system PM). The net result is code shrinkage and simplification.
There's very little point in keeping CONFIG_USB_SUSPEND because almost everybody enables it. The few that don't will find that the usbcore module has gotten somewhat bigger and they will have to take active measures if they want to prevent hubs from being runtime suspended. Signed-off-by: Alan Stern CC: Peter Chen Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/Kconfig | 16 ---------------- drivers/usb/core/driver.c | 4 ++-- drivers/usb/core/hcd.c | 10 +++++----- drivers/usb/core/hub.c | 42 +++++++----------------------------------- drivers/usb/core/port.c | 4 ++-- drivers/usb/core/sysfs.c | 4 ++-- drivers/usb/core/usb.c | 4 ++-- drivers/usb/core/usb.h | 2 +- drivers/usb/host/ehci-pci.c | 12 +----------- drivers/usb/host/ohci-hub.c | 6 ------ drivers/usb/host/sl811-hcd.c | 2 +- drivers/usb/host/u132-hcd.c | 9 +++++---- drivers/usb/host/xhci-hub.c | 2 +- drivers/usb/host/xhci.c | 4 ++-- include/linux/usb.h | 2 +- include/linux/usb/hcd.h | 6 +++--- 16 files changed, 35 insertions(+), 94 deletions(-) (limited to 'include/linux') diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig index f70c1a1694ad..175701a2dae4 100644 --- a/drivers/usb/core/Kconfig +++ b/drivers/usb/core/Kconfig @@ -38,22 +38,6 @@ config USB_DYNAMIC_MINORS If you are unsure about this, say N here. -config USB_SUSPEND - bool "USB runtime power management (autosuspend) and wakeup" - depends on USB && PM_RUNTIME - help - If you say Y here, you can use driver calls or the sysfs - "power/control" file to enable or disable autosuspend for - individual USB peripherals (see - Documentation/usb/power-management.txt for more details). - - Also, USB "remote wakeup" signaling is supported, whereby some - USB devices (like keyboards and network adapters) can wake up - their parent hub. That wakeup cascades up the USB tree, and - could wake the system from states like suspend-to-RAM. - - If you are unsure about this, say N here. 
- config USB_OTG bool "OTG support" depends on USB diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index eb1d00a3543a..84d2b0585810 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -1412,7 +1412,7 @@ int usb_resume(struct device *dev, pm_message_t msg) #endif /* CONFIG_PM */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME /** * usb_enable_autosuspend - allow a USB device to be autosuspended @@ -1780,7 +1780,7 @@ int usb_set_usb2_hardware_lpm(struct usb_device *udev, int enable) return ret; } -#endif /* CONFIG_USB_SUSPEND */ +#endif /* CONFIG_PM_RUNTIME */ struct bus_type usb_bus_type = { .name = "usb", diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index f9ec44cbb82f..d53547d2e4c7 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -2125,7 +2125,7 @@ int hcd_bus_resume(struct usb_device *rhdev, pm_message_t msg) #endif /* CONFIG_PM */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME /* Workqueue routine for root-hub remote wakeup */ static void hcd_resume_work(struct work_struct *work) @@ -2160,7 +2160,7 @@ void usb_hcd_resume_root_hub (struct usb_hcd *hcd) } EXPORT_SYMBOL_GPL(usb_hcd_resume_root_hub); -#endif /* CONFIG_USB_SUSPEND */ +#endif /* CONFIG_PM_RUNTIME */ /*-------------------------------------------------------------------------*/ @@ -2336,7 +2336,7 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, init_timer(&hcd->rh_timer); hcd->rh_timer.function = rh_timer_func; hcd->rh_timer.data = (unsigned long) hcd; -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME INIT_WORK(&hcd->wakeup_work, hcd_resume_work); #endif @@ -2590,7 +2590,7 @@ error_create_attr_group: hcd->rh_registered = 0; spin_unlock_irq(&hcd_root_hub_lock); -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME cancel_work_sync(&hcd->wakeup_work); #endif mutex_lock(&usb_bus_list_lock); @@ -2645,7 +2645,7 @@ void usb_remove_hcd(struct usb_hcd *hcd) hcd->rh_registered = 0; spin_unlock_irq (&hcd_root_hub_lock); -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME cancel_work_sync(&hcd->wakeup_work); #endif diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 443d5cc9330b..feef9351463d 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -2827,7 +2827,7 @@ void usb_enable_ltm(struct usb_device *udev) } EXPORT_SYMBOL_GPL(usb_enable_ltm); -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM /* * usb_disable_function_remotewakeup - disable usb3.0 * device's function remote wakeup @@ -2886,7 +2886,7 @@ static int usb_disable_function_remotewakeup(struct usb_device *udev) * Linux (2.6) currently has NO mechanisms to initiate that: no khubd * timer, no SRP, no requests through sysfs. * - * If CONFIG_USB_SUSPEND isn't enabled, non-SuperSpeed devices really get + * If Runtime PM isn't enabled or used, non-SuperSpeed devices really get * suspended only when their bus goes into global suspend (i.e., the root * hub is suspended). Nevertheless, we change @udev->state to * USB_STATE_SUSPENDED as this is the device's "logical" state. The actual @@ -3247,6 +3247,10 @@ int usb_port_resume(struct usb_device *udev, pm_message_t msg) return status; } +#endif /* CONFIG_PM */ + +#ifdef CONFIG_PM_RUNTIME + /* caller has locked udev */ int usb_remote_wakeup(struct usb_device *udev) { @@ -3263,38 +3267,6 @@ int usb_remote_wakeup(struct usb_device *udev) return status; } -#else /* CONFIG_USB_SUSPEND */ - -/* When CONFIG_USB_SUSPEND isn't set, we never suspend or resume any ports. 
*/ - -int usb_port_suspend(struct usb_device *udev, pm_message_t msg) -{ - return 0; -} - -/* However we may need to do a reset-resume */ - -int usb_port_resume(struct usb_device *udev, pm_message_t msg) -{ - struct usb_hub *hub = usb_hub_to_struct_hub(udev->parent); - int port1 = udev->portnum; - int status; - u16 portchange, portstatus; - - status = hub_port_status(hub, port1, &portstatus, &portchange); - status = check_port_resume_type(udev, - hub, port1, status, portchange, portstatus); - - if (status) { - dev_dbg(&udev->dev, "can't resume, status %d\n", status); - hub_port_logical_disconnect(hub, port1); - } else if (udev->reset_resume) { - dev_dbg(&udev->dev, "reset-resume\n"); - status = usb_reset_and_verify_device(udev); - } - return status; -} - #endif static int check_ports_changed(struct usb_hub *hub) @@ -4356,7 +4328,7 @@ static void hub_port_connect_change(struct usb_hub *hub, int port1, if (portstatus & USB_PORT_STAT_ENABLE) { status = 0; /* Nothing to do */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME } else if (udev->state == USB_STATE_SUSPENDED && udev->persist_enabled) { /* For a suspended device, treat this as a diff --git a/drivers/usb/core/port.c b/drivers/usb/core/port.c index 797f9d514732..06c4894bf181 100644 --- a/drivers/usb/core/port.c +++ b/drivers/usb/core/port.c @@ -71,7 +71,7 @@ static void usb_port_device_release(struct device *dev) kfree(port_dev); } -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME static int usb_port_runtime_resume(struct device *dev) { struct usb_port *port_dev = to_usb_port(dev); @@ -139,7 +139,7 @@ static int usb_port_runtime_suspend(struct device *dev) #endif static const struct dev_pm_ops usb_port_pm_ops = { -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME .runtime_suspend = usb_port_runtime_suspend, .runtime_resume = usb_port_runtime_resume, .runtime_idle = pm_generic_runtime_idle, diff --git a/drivers/usb/core/sysfs.c b/drivers/usb/core/sysfs.c index 3f81a3dc6867..aa38db44818a 100644 --- a/drivers/usb/core/sysfs.c +++ b/drivers/usb/core/sysfs.c @@ -338,7 +338,7 @@ static void remove_persist_attributes(struct device *dev) #endif /* CONFIG_PM */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME static ssize_t show_connected_duration(struct device *dev, struct device_attribute *attr, @@ -544,7 +544,7 @@ static void remove_power_attributes(struct device *dev) #define add_power_attributes(dev) 0 #define remove_power_attributes(dev) do {} while (0) -#endif /* CONFIG_USB_SUSPEND */ +#endif /* CONFIG_PM_RUNTIME */ /* Descriptor fields */ diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index f81b92572735..03eb7ae8fc1a 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -49,7 +49,7 @@ const char *usbcore_name = "usbcore"; static bool nousb; /* Disable USB when built into kernel image */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME static int usb_autosuspend_delay = 2; /* Default delay value, * in seconds */ module_param_named(autosuspend, usb_autosuspend_delay, int, 0644); @@ -307,7 +307,7 @@ static const struct dev_pm_ops usb_device_pm_ops = { .thaw = usb_dev_thaw, .poweroff = usb_dev_poweroff, .restore = usb_dev_restore, -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME .runtime_suspend = usb_runtime_suspend, .runtime_resume = usb_runtime_resume, .runtime_idle = usb_runtime_idle, diff --git a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h index a7f20bde0e5e..823857767a16 100644 --- a/drivers/usb/core/usb.h +++ b/drivers/usb/core/usb.h @@ -93,7 +93,7 @@ static inline int 
usb_port_resume(struct usb_device *udev, pm_message_t msg) #endif -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME extern void usb_autosuspend_device(struct usb_device *udev); extern int usb_autoresume_device(struct usb_device *udev); diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index 170b9399e09f..a573d5ff9adc 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -292,17 +292,7 @@ static int ehci_pci_setup(struct usb_hcd *hcd) } } -#ifdef CONFIG_USB_SUSPEND - /* REVISIT: the controller works fine for wakeup iff the root hub - * itself is "globally" suspended, but usbcore currently doesn't - * understand such things. - * - * System suspend currently expects to be able to suspend the entire - * device tree, device-at-a-time. If we failed selective suspend - * reports, system suspend would fail; so the root hub code must claim - * success. That's lying to usbcore, and it matters for runtime - * PM scenarios with selective suspend and remote wakeup... - */ +#ifdef CONFIG_PM_RUNTIME if (ehci->no_selective_suspend && device_can_wakeup(&pdev->dev)) ehci_warn(ehci, "selective suspend/wakeup unavailable\n"); #endif diff --git a/drivers/usb/host/ohci-hub.c b/drivers/usb/host/ohci-hub.c index db09dae7b557..60ff4220e8b4 100644 --- a/drivers/usb/host/ohci-hub.c +++ b/drivers/usb/host/ohci-hub.c @@ -580,14 +580,8 @@ static int ohci_start_port_reset (struct usb_hcd *hcd, unsigned port) /* See usb 7.1.7.5: root hubs must issue at least 50 msec reset signaling, * not necessarily continuous ... to guard against resume signaling. - * The short timeout is safe for non-root hubs, and is backward-compatible - * with earlier Linux hosts. */ -#ifdef CONFIG_USB_SUSPEND #define PORT_RESET_MSEC 50 -#else -#define PORT_RESET_MSEC 10 -#endif /* this timer value might be vendor-specific ... */ #define PORT_RESET_HW_MSEC 10 diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c index d62f0404baaa..15ed7e8d887f 100644 --- a/drivers/usb/host/sl811-hcd.c +++ b/drivers/usb/host/sl811-hcd.c @@ -1755,7 +1755,7 @@ sl811h_probe(struct platform_device *dev) /* for this device there's no useful distinction between the controller * and its root hub, except that the root hub only gets direct PM calls - * when CONFIG_USB_SUSPEND is enabled. + * when CONFIG_PM_RUNTIME is enabled. */ static int diff --git a/drivers/usb/host/u132-hcd.c b/drivers/usb/host/u132-hcd.c index 5efdffe32365..5c124bf5d018 100644 --- a/drivers/usb/host/u132-hcd.c +++ b/drivers/usb/host/u132-hcd.c @@ -3141,10 +3141,11 @@ static int u132_probe(struct platform_device *pdev) #ifdef CONFIG_PM -/* for this device there's no useful distinction between the controller -* and its root hub, except that the root hub only gets direct PM calls -* when CONFIG_USB_SUSPEND is enabled. -*/ +/* + * for this device there's no useful distinction between the controller + * and its root hub, except that the root hub only gets direct PM calls + * when CONFIG_PM_RUNTIME is enabled. 
+ */ static int u132_suspend(struct platform_device *pdev, pm_message_t state) { struct usb_hcd *hcd = platform_get_drvdata(pdev); diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 68914429482f..187a3ec1069a 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1075,7 +1075,7 @@ int xhci_bus_suspend(struct usb_hcd *hcd) set_bit(port_index, &bus_state->bus_suspended); } /* USB core sets remote wake mask for USB 3.0 hubs, - * including the USB 3.0 roothub, but only if CONFIG_USB_SUSPEND + * including the USB 3.0 roothub, but only if CONFIG_PM_RUNTIME * is enabled, so also enable remote wake here. */ if (hcd->self.root_hub->do_remote_wakeup) { diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 53b8f89a0b1c..5156b720a53a 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -3801,7 +3801,7 @@ int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1) return raw_port; } -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME /* BESL to HIRD Encoding array for USB2 LPM */ static int xhci_besl_encoding[16] = {125, 150, 200, 300, 400, 500, 1000, 2000, @@ -4051,7 +4051,7 @@ int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev) return 0; } -#endif /* CONFIG_USB_SUSPEND */ +#endif /* CONFIG_PM_RUNTIME */ /*---------------------- USB 3.0 Link PM functions ------------------------*/ diff --git a/include/linux/usb.h b/include/linux/usb.h index 8d4bc173d66a..a0bee5a28d1a 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -617,7 +617,7 @@ static inline bool usb_acpi_power_manageable(struct usb_device *hdev, int index) #endif /* USB autosuspend and autoresume */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME extern void usb_enable_autosuspend(struct usb_device *udev); extern void usb_disable_autosuspend(struct usb_device *udev); diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 59694b5e5e90..f5f5c7dfda90 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -84,7 +84,7 @@ struct usb_hcd { struct timer_list rh_timer; /* drives root-hub polling */ struct urb *status_urb; /* the current status urb */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME struct work_struct wakeup_work; /* for remote wakeup */ #endif @@ -593,14 +593,14 @@ extern int hcd_bus_suspend(struct usb_device *rhdev, pm_message_t msg); extern int hcd_bus_resume(struct usb_device *rhdev, pm_message_t msg); #endif /* CONFIG_PM */ -#ifdef CONFIG_USB_SUSPEND +#ifdef CONFIG_PM_RUNTIME extern void usb_hcd_resume_root_hub(struct usb_hcd *hcd); #else static inline void usb_hcd_resume_root_hub(struct usb_hcd *hcd) { return; } -#endif /* CONFIG_USB_SUSPEND */ +#endif /* CONFIG_PM_RUNTIME */ /*-------------------------------------------------------------------------*/ -- cgit From 5d0f6131a79adfa1fb51309c5f81a2a4ef879dd4 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Mon, 4 Mar 2013 18:40:58 -0700 Subject: NVMe: Add nvme-scsi.c Translates SCSI commands in SG_IO ioctl to NVMe commands. Uses the scsi-nvme translation spec from nvmexpress.org as reference. 
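User space reaches this translation layer through the existing SG_IO ioctl on the NVMe block device node, so standard sg-based tools keep working. A minimal sketch of such a caller (the device path is an assumption; the driver turns this INQUIRY into NVMe Identify commands, as the translation code below shows):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <scsi/sg.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, alloc len 96 */
	unsigned char buf[96], sense[32];
	struct sg_io_hdr io;
	int fd = open("/dev/nvme0n1", O_RDONLY);		/* assumed namespace node */

	if (fd < 0)
		return 1;

	memset(&io, 0, sizeof(io));
	io.interface_id = 'S';
	io.cmd_len = sizeof(cdb);
	io.cmdp = cdb;
	io.dxfer_direction = SG_DXFER_FROM_DEV;
	io.dxfer_len = sizeof(buf);
	io.dxferp = buf;
	io.mx_sb_len = sizeof(sense);
	io.sbp = sense;
	io.timeout = 5000;					/* milliseconds */

	/* The driver answers with data built from NVMe Identify results. */
	if (ioctl(fd, SG_IO, &io) == 0 && io.status == 0)
		printf("vendor %.8s model %.16s\n",
		       (char *)&buf[8], (char *)&buf[16]);

	close(fd);
	return 0;
}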
Signed-off-by: Vishal Verma Signed-off-by: Matthew Wilcox --- drivers/block/Makefile | 2 +- drivers/block/nvme-core.c | 37 +- drivers/block/nvme-scsi.c | 2941 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/nvme.h | 35 + 4 files changed, 2997 insertions(+), 18 deletions(-) create mode 100644 drivers/block/nvme-scsi.c (limited to 'include/linux') diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 2a41c86d3ad9..ca07399a8d99 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -42,5 +42,5 @@ obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ -nvme-y := nvme-core.o +nvme-y := nvme-core.o nvme-scsi.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index d0cfb85d5582..a89f7dbefba0 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -39,7 +39,7 @@ #include #include #include - +#include #include #define NVME_Q_DEPTH 1024 @@ -224,12 +224,12 @@ static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, return ctx; } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) +struct nvme_queue *get_nvmeq(struct nvme_dev *dev) { return dev->queues[get_cpu() + 1]; } -static void put_nvmeq(struct nvme_queue *nvmeq) +void put_nvmeq(struct nvme_queue *nvmeq) { put_cpu(); } @@ -290,7 +290,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) return iod; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = PAGE_SIZE / 8 - 1; int i; @@ -339,9 +339,8 @@ static void bio_completion(struct nvme_dev *dev, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, - struct nvme_common_command *cmd, struct nvme_iod *iod, - int total_len, gfp_t gfp) +int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd, + struct nvme_iod *iod, int total_len, gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -512,7 +511,7 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, return 0; } -static int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns) { int cmdid = alloc_cmdid(nvmeq, (void *)CMD_CTX_FLUSH, special_completion, NVME_IO_TIMEOUT); @@ -715,8 +714,8 @@ static void sync_completion(struct nvme_dev *dev, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, - struct nvme_command *cmd, u32 *result, unsigned timeout) +int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, + u32 *result, unsigned timeout) { int cmdid; struct sync_cmd_info cmdinfo; @@ -745,7 +744,7 @@ static int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, return cmdinfo.status; } -static int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { return nvme_submit_sync_cmd(dev->queues[0], cmd, result, ADMIN_TIMEOUT); @@ -818,7 +817,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, +int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, dma_addr_t dma_addr) { struct nvme_command c; @@ -832,7 +831,7 @@ static int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, return nvme_submit_admin_cmd(dev, &c, NULL); } -static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -846,8 +845,8 @@ static int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, return nvme_submit_admin_cmd(dev, &c, result); } -static int nvme_set_features(struct nvme_dev *dev, unsigned fid, - unsigned dword11, dma_addr_t dma_addr, u32 *result) +int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) { struct nvme_command c; @@ -1065,7 +1064,7 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, +struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length) { int i, err, count, nents, offset; @@ -1121,7 +1120,7 @@ static struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, return ERR_PTR(err); } -static void nvme_unmap_user_pages(struct nvme_dev *dev, int write, +void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod) { int i; @@ -1257,6 +1256,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, return nvme_user_admin_cmd(ns->dev, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); default: return -ENOTTY; } diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c new file mode 100644 index 000000000000..483af3585c92 --- /dev/null +++ b/drivers/block/nvme-scsi.c @@ -0,0 +1,2941 @@ +/* + * NVM Express device driver + * Copyright (c) 2011, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * Refer to the SCSI-NVMe Translation spec for details on how + * each command is translated. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int sg_version_num = 30534; /* 2 digits for each component */ + +#define SNTI_TRANSLATION_SUCCESS 0 +#define SNTI_INTERNAL_ERROR 1 + +/* VPD Page Codes */ +#define VPD_SUPPORTED_PAGES 0x00 +#define VPD_SERIAL_NUMBER 0x80 +#define VPD_DEVICE_IDENTIFIERS 0x83 +#define VPD_EXTENDED_INQUIRY 0x86 +#define VPD_BLOCK_DEV_CHARACTERISTICS 0xB1 + +/* CDB offsets */ +#define REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET 6 +#define REPORT_LUNS_SR_OFFSET 2 +#define READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET 10 +#define REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET 4 +#define REQUEST_SENSE_DESC_OFFSET 1 +#define REQUEST_SENSE_DESC_MASK 0x01 +#define DESCRIPTOR_FORMAT_SENSE_DATA_TYPE 1 +#define INQUIRY_EVPD_BYTE_OFFSET 1 +#define INQUIRY_PAGE_CODE_BYTE_OFFSET 2 +#define INQUIRY_EVPD_BIT_MASK 1 +#define INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET 3 +#define START_STOP_UNIT_CDB_IMMED_OFFSET 1 +#define START_STOP_UNIT_CDB_IMMED_MASK 0x1 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET 3 +#define START_STOP_UNIT_CDB_POWER_COND_MOD_MASK 0xF +#define START_STOP_UNIT_CDB_POWER_COND_OFFSET 4 +#define START_STOP_UNIT_CDB_POWER_COND_MASK 0xF0 +#define START_STOP_UNIT_CDB_NO_FLUSH_OFFSET 4 +#define START_STOP_UNIT_CDB_NO_FLUSH_MASK 0x4 +#define START_STOP_UNIT_CDB_START_OFFSET 4 +#define START_STOP_UNIT_CDB_START_MASK 0x1 +#define WRITE_BUFFER_CDB_MODE_OFFSET 1 +#define WRITE_BUFFER_CDB_MODE_MASK 0x1F +#define WRITE_BUFFER_CDB_BUFFER_ID_OFFSET 2 +#define WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET 3 +#define WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET 6 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK 0xC0 +#define FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT 6 +#define FORMAT_UNIT_CDB_LONG_LIST_OFFSET 1 +#define FORMAT_UNIT_CDB_LONG_LIST_MASK 0x20 +#define FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET 1 +#define FORMAT_UNIT_CDB_FORMAT_DATA_MASK 0x10 +#define FORMAT_UNIT_SHORT_PARM_LIST_LEN 4 +#define FORMAT_UNIT_LONG_PARM_LIST_LEN 8 +#define FORMAT_UNIT_PROT_INT_OFFSET 3 +#define FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET 0 +#define FORMAT_UNIT_PROT_FIELD_USAGE_MASK 0x07 + +/* Misc. 
defines */ +#define NIBBLE_SHIFT 4 +#define FIXED_SENSE_DATA 0x70 +#define DESC_FORMAT_SENSE_DATA 0x72 +#define FIXED_SENSE_DATA_ADD_LENGTH 10 +#define LUN_ENTRY_SIZE 8 +#define LUN_DATA_HEADER_SIZE 8 +#define ALL_LUNS_RETURNED 0x02 +#define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 +#define RESTRICTED_LUNS_RETURNED 0x00 +#define NVME_POWER_STATE_START_VALID 0x00 +#define NVME_POWER_STATE_ACTIVE 0x01 +#define NVME_POWER_STATE_IDLE 0x02 +#define NVME_POWER_STATE_STANDBY 0x03 +#define NVME_POWER_STATE_LU_CONTROL 0x07 +#define POWER_STATE_0 0 +#define POWER_STATE_1 1 +#define POWER_STATE_2 2 +#define POWER_STATE_3 3 +#define DOWNLOAD_SAVE_ACTIVATE 0x05 +#define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E +#define ACTIVATE_DEFERRED_MICROCODE 0x0F +#define FORMAT_UNIT_IMMED_MASK 0x2 +#define FORMAT_UNIT_IMMED_OFFSET 1 +#define KELVIN_TEMP_FACTOR 273 +#define FIXED_FMT_SENSE_DATA_SIZE 18 +#define DESC_FMT_SENSE_DATA_SIZE 8 + +/* SCSI/NVMe defines and bit masks */ +#define INQ_STANDARD_INQUIRY_PAGE 0x00 +#define INQ_SUPPORTED_VPD_PAGES_PAGE 0x00 +#define INQ_UNIT_SERIAL_NUMBER_PAGE 0x80 +#define INQ_DEVICE_IDENTIFICATION_PAGE 0x83 +#define INQ_EXTENDED_INQUIRY_DATA_PAGE 0x86 +#define INQ_BDEV_CHARACTERISTICS_PAGE 0xB1 +#define INQ_SERIAL_NUMBER_LENGTH 0x14 +#define INQ_NUM_SUPPORTED_VPD_PAGES 5 +#define VERSION_SPC_4 0x06 +#define ACA_UNSUPPORTED 0 +#define STANDARD_INQUIRY_LENGTH 36 +#define ADDITIONAL_STD_INQ_LENGTH 31 +#define EXTENDED_INQUIRY_DATA_PAGE_LENGTH 0x3C +#define RESERVED_FIELD 0 + +/* SCSI READ/WRITE Defines */ +#define IO_CDB_WP_MASK 0xE0 +#define IO_CDB_WP_SHIFT 5 +#define IO_CDB_FUA_MASK 0x8 +#define IO_6_CDB_LBA_OFFSET 0 +#define IO_6_CDB_LBA_MASK 0x001FFFFF +#define IO_6_CDB_TX_LEN_OFFSET 4 +#define IO_6_DEFAULT_TX_LEN 256 +#define IO_10_CDB_LBA_OFFSET 2 +#define IO_10_CDB_TX_LEN_OFFSET 7 +#define IO_10_CDB_WP_OFFSET 1 +#define IO_10_CDB_FUA_OFFSET 1 +#define IO_12_CDB_LBA_OFFSET 2 +#define IO_12_CDB_TX_LEN_OFFSET 6 +#define IO_12_CDB_WP_OFFSET 1 +#define IO_12_CDB_FUA_OFFSET 1 +#define IO_16_CDB_FUA_OFFSET 1 +#define IO_16_CDB_WP_OFFSET 1 +#define IO_16_CDB_LBA_OFFSET 2 +#define IO_16_CDB_TX_LEN_OFFSET 10 + +/* Mode Sense/Select defines */ +#define MODE_PAGE_INFO_EXCEP 0x1C +#define MODE_PAGE_CACHING 0x08 +#define MODE_PAGE_CONTROL 0x0A +#define MODE_PAGE_POWER_CONDITION 0x1A +#define MODE_PAGE_RETURN_ALL 0x3F +#define MODE_PAGE_BLK_DES_LEN 0x08 +#define MODE_PAGE_LLBAA_BLK_DES_LEN 0x10 +#define MODE_PAGE_CACHING_LEN 0x14 +#define MODE_PAGE_CONTROL_LEN 0x0C +#define MODE_PAGE_POW_CND_LEN 0x28 +#define MODE_PAGE_INF_EXC_LEN 0x0C +#define MODE_PAGE_ALL_LEN 0x54 +#define MODE_SENSE6_MPH_SIZE 4 +#define MODE_SENSE6_ALLOC_LEN_OFFSET 4 +#define MODE_SENSE_PAGE_CONTROL_OFFSET 2 +#define MODE_SENSE_PAGE_CONTROL_MASK 0xC0 +#define MODE_SENSE_PAGE_CODE_OFFSET 2 +#define MODE_SENSE_PAGE_CODE_MASK 0x3F +#define MODE_SENSE_LLBAA_OFFSET 1 +#define MODE_SENSE_LLBAA_MASK 0x10 +#define MODE_SENSE_LLBAA_SHIFT 4 +#define MODE_SENSE_DBD_OFFSET 1 +#define MODE_SENSE_DBD_MASK 8 +#define MODE_SENSE_DBD_SHIFT 3 +#define MODE_SENSE10_MPH_SIZE 8 +#define MODE_SENSE10_ALLOC_LEN_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_OFFSET 1 +#define MODE_SELECT_CDB_SAVE_PAGES_OFFSET 1 +#define MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET 4 +#define MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET 7 +#define MODE_SELECT_CDB_PAGE_FORMAT_MASK 0x10 +#define MODE_SELECT_CDB_SAVE_PAGES_MASK 0x1 +#define MODE_SELECT_6_BD_OFFSET 3 +#define MODE_SELECT_10_BD_OFFSET 6 +#define MODE_SELECT_10_LLBAA_OFFSET 4 +#define MODE_SELECT_10_LLBAA_MASK 1 
+#define MODE_SELECT_6_MPH_SIZE 4 +#define MODE_SELECT_10_MPH_SIZE 8 +#define CACHING_MODE_PAGE_WCE_MASK 0x04 +#define MODE_SENSE_BLK_DESC_ENABLED 0 +#define MODE_SENSE_BLK_DESC_COUNT 1 +#define MODE_SELECT_PAGE_CODE_MASK 0x3F +#define SHORT_DESC_BLOCK 8 +#define LONG_DESC_BLOCK 16 +#define MODE_PAGE_POW_CND_LEN_FIELD 0x26 +#define MODE_PAGE_INF_EXC_LEN_FIELD 0x0A +#define MODE_PAGE_CACHING_LEN_FIELD 0x12 +#define MODE_PAGE_CONTROL_LEN_FIELD 0x0A +#define MODE_SENSE_PC_CURRENT_VALUES 0 + +/* Log Sense defines */ +#define LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE 0x00 +#define LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH 0x07 +#define LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE 0x2F +#define LOG_PAGE_TEMPERATURE_PAGE 0x0D +#define LOG_SENSE_CDB_SP_OFFSET 1 +#define LOG_SENSE_CDB_SP_NOT_ENABLED 0 +#define LOG_SENSE_CDB_PC_OFFSET 2 +#define LOG_SENSE_CDB_PC_MASK 0xC0 +#define LOG_SENSE_CDB_PC_SHIFT 6 +#define LOG_SENSE_CDB_PC_CUMULATIVE_VALUES 1 +#define LOG_SENSE_CDB_PAGE_CODE_MASK 0x3F +#define LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET 7 +#define REMAINING_INFO_EXCP_PAGE_LENGTH 0x8 +#define LOG_INFO_EXCP_PAGE_LENGTH 0xC +#define REMAINING_TEMP_PAGE_LENGTH 0xC +#define LOG_TEMP_PAGE_LENGTH 0x10 +#define LOG_TEMP_UNKNOWN 0xFF +#define SUPPORTED_LOG_PAGES_PAGE_LENGTH 0x3 + +/* Read Capacity defines */ +#define READ_CAP_10_RESP_SIZE 8 +#define READ_CAP_16_RESP_SIZE 32 + +/* NVMe Namespace and Command Defines */ +#define NVME_GET_SMART_LOG_PAGE 0x02 +#define NVME_GET_FEAT_TEMP_THRESH 0x04 +#define BYTES_TO_DWORDS 4 +#define NVME_MAX_FIRMWARE_SLOT 7 + +/* Report LUNs defines */ +#define REPORT_LUNS_FIRST_LUN_OFFSET 8 + +/* SCSI ADDITIONAL SENSE Codes */ + +#define SCSI_ASC_NO_SENSE 0x00 +#define SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT 0x03 +#define SCSI_ASC_LUN_NOT_READY 0x04 +#define SCSI_ASC_WARNING 0x0B +#define SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED 0x10 +#define SCSI_ASC_UNRECOVERED_READ_ERROR 0x11 +#define SCSI_ASC_MISCOMPARE_DURING_VERIFY 0x1D +#define SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID 0x20 +#define SCSI_ASC_ILLEGAL_COMMAND 0x20 +#define SCSI_ASC_ILLEGAL_BLOCK 0x21 +#define SCSI_ASC_INVALID_CDB 0x24 +#define SCSI_ASC_INVALID_LUN 0x25 +#define SCSI_ASC_INVALID_PARAMETER 0x26 +#define SCSI_ASC_FORMAT_COMMAND_FAILED 0x31 +#define SCSI_ASC_INTERNAL_TARGET_FAILURE 0x44 + +/* SCSI ADDITIONAL SENSE Code Qualifiers */ + +#define SCSI_ASCQ_CAUSE_NOT_REPORTABLE 0x00 +#define SCSI_ASCQ_FORMAT_COMMAND_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED 0x01 +#define SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED 0x02 +#define SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED 0x03 +#define SCSI_ASCQ_FORMAT_IN_PROGRESS 0x04 +#define SCSI_ASCQ_POWER_LOSS_EXPECTED 0x08 +#define SCSI_ASCQ_INVALID_LUN_ID 0x09 + +/** + * DEVICE_SPECIFIC_PARAMETER in mode parameter header (see sbc2r16) to + * enable DPOFUA support type 0x10 value. 
+ */ +#define DEVICE_SPECIFIC_PARAMETER 0 +#define VPD_ID_DESCRIPTOR_LENGTH sizeof(VPD_IDENTIFICATION_DESCRIPTOR) + +/* MACROs to extract information from CDBs */ + +#define GET_OPCODE(cdb) cdb[0] + +#define GET_U8_FROM_CDB(cdb, index) (cdb[index] << 0) + +#define GET_U16_FROM_CDB(cdb, index) ((cdb[index] << 8) | (cdb[index + 1] << 0)) + +#define GET_U24_FROM_CDB(cdb, index) ((cdb[index] << 16) | \ +(cdb[index + 1] << 8) | \ +(cdb[index + 2] << 0)) + +#define GET_U32_FROM_CDB(cdb, index) ((cdb[index] << 24) | \ +(cdb[index + 1] << 16) | \ +(cdb[index + 2] << 8) | \ +(cdb[index + 3] << 0)) + +#define GET_U64_FROM_CDB(cdb, index) ((((u64)cdb[index]) << 56) | \ +(((u64)cdb[index + 1]) << 48) | \ +(((u64)cdb[index + 2]) << 40) | \ +(((u64)cdb[index + 3]) << 32) | \ +(((u64)cdb[index + 4]) << 24) | \ +(((u64)cdb[index + 5]) << 16) | \ +(((u64)cdb[index + 6]) << 8) | \ +(((u64)cdb[index + 7]) << 0)) + +/* Inquiry Helper Macros */ +#define GET_INQ_EVPD_BIT(cdb) \ +((GET_U8_FROM_CDB(cdb, INQUIRY_EVPD_BYTE_OFFSET) & \ +INQUIRY_EVPD_BIT_MASK) ? 1 : 0) + +#define GET_INQ_PAGE_CODE(cdb) \ +(GET_U8_FROM_CDB(cdb, INQUIRY_PAGE_CODE_BYTE_OFFSET)) + +#define GET_INQ_ALLOC_LENGTH(cdb) \ +(GET_U16_FROM_CDB(cdb, INQUIRY_CDB_ALLOCATION_LENGTH_OFFSET)) + +/* Report LUNs Helper Macros */ +#define GET_REPORT_LUNS_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, REPORT_LUNS_CDB_ALLOC_LENGTH_OFFSET)) + +/* Read Capacity Helper Macros */ +#define GET_READ_CAP_16_ALLOC_LENGTH(cdb) \ +(GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) + +#define IS_READ_CAP_16(cdb) \ +((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) + +/* Request Sense Helper Macros */ +#define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ +(GET_U8_FROM_CDB(cdb, REQUEST_SENSE_CDB_ALLOC_LENGTH_OFFSET)) + +/* Mode Sense Helper Macros */ +#define GET_MODE_SENSE_DBD(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_DBD_OFFSET) & MODE_SENSE_DBD_MASK) >> \ +MODE_SENSE_DBD_SHIFT) + +#define GET_MODE_SENSE_LLBAA(cdb) \ +((GET_U8_FROM_CDB(cdb, MODE_SENSE_LLBAA_OFFSET) & \ +MODE_SENSE_LLBAA_MASK) >> MODE_SENSE_LLBAA_SHIFT) + +#define GET_MODE_SENSE_MPH_SIZE(cdb10) \ +(cdb10 ? MODE_SENSE10_MPH_SIZE : MODE_SENSE6_MPH_SIZE) + + +/* Struct to gather data that needs to be extracted from a SCSI CDB. + Not conforming to any particular CDB variant, but compatible with all. 
*/ + +struct nvme_trans_io_cdb { + u8 fua; + u8 prot_info; + u64 lba; + u32 xfer_len; +}; + + +/* Internal Helper Functions */ + + +/* Copy data to userspace memory */ + +static int nvme_trans_copy_to_user(struct sg_io_hdr *hdr, void *from, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = from; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + for (i = 0; i < hdr->iovec_count; i++) { + xfer_len = min(remaining, sgl[i].iov_len); + not_copied = copy_to_user(__user sgl[i].iov_base, index, + xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + not_copied = copy_to_user(__user hdr->dxferp, from, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Copy data from userspace memory */ + +static int nvme_trans_copy_from_user(struct sg_io_hdr *hdr, void *to, + unsigned long n) +{ + int res = SNTI_TRANSLATION_SUCCESS; + unsigned long not_copied; + int i; + void *index = to; + size_t remaining = n; + size_t xfer_len; + + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + for (i = 0; i < hdr->iovec_count; i++) { + xfer_len = min(remaining, sgl[i].iov_len); + not_copied = copy_from_user(index, + __user sgl[i].iov_base, xfer_len); + if (not_copied) { + res = -EFAULT; + break; + } + index += xfer_len; + remaining -= xfer_len; + if (remaining == 0) + break; + } + return res; + } + + not_copied = copy_from_user(to, __user hdr->dxferp, n); + if (not_copied) + res = -EFAULT; + return res; +} + +/* Status/Sense Buffer Writeback */ + +static int nvme_trans_completion(struct sg_io_hdr *hdr, u8 status, u8 sense_key, + u8 asc, u8 ascq) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 xfer_len; + u8 resp[DESC_FMT_SENSE_DATA_SIZE]; + + if (scsi_status_is_good(status)) { + hdr->status = SAM_STAT_GOOD; + hdr->masked_status = GOOD; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + hdr->sb_len_wr = 0; + } else { + hdr->status = status; + hdr->masked_status = status >> 1; + hdr->host_status = DID_OK; + hdr->driver_status = DRIVER_OK; + + memset(resp, 0, DESC_FMT_SENSE_DATA_SIZE); + resp[0] = DESC_FORMAT_SENSE_DATA; + resp[1] = sense_key; + resp[2] = asc; + resp[3] = ascq; + + xfer_len = min_t(u8, hdr->mx_sb_len, DESC_FMT_SENSE_DATA_SIZE); + hdr->sb_len_wr = xfer_len; + if (copy_to_user(__user hdr->sbp, resp, xfer_len) > 0) + res = -EFAULT; + } + + return res; +} + +static int nvme_trans_status_code(struct sg_io_hdr *hdr, int nvme_sc) +{ + u8 status, sense_key, asc, ascq; + int res = SNTI_TRANSLATION_SUCCESS; + + /* For non-nvme (Linux) errors, simply return the error code */ + if (nvme_sc < 0) + return nvme_sc; + + /* Mask DNR, More, and reserved fields */ + nvme_sc &= 0x7FF; + + switch (nvme_sc) { + /* Generic Command Status */ + case NVME_SC_SUCCESS: + status = SAM_STAT_GOOD; + sense_key = NO_SENSE; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_OPCODE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_COMMAND; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_FIELD: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_DATA_XFER_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = 
SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_POWER_LOSS: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_WARNING; + ascq = SCSI_ASCQ_POWER_LOSS_EXPECTED; + break; + case NVME_SC_INTERNAL: + status = SAM_STAT_CHECK_CONDITION; + sense_key = HARDWARE_ERROR; + asc = SCSI_ASC_INTERNAL_TARGET_FAILURE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_REQ: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ABORT_QUEUE: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_FAIL: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_FUSED_MISSING: + status = SAM_STAT_TASK_ABORTED; + sense_key = ABORTED_COMMAND; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_INVALID_NS: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = SCSI_ASCQ_INVALID_LUN_ID; + break; + case NVME_SC_LBA_RANGE: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ILLEGAL_BLOCK; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_CAP_EXCEEDED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_NS_NOT_READY: + status = SAM_STAT_CHECK_CONDITION; + sense_key = NOT_READY; + asc = SCSI_ASC_LUN_NOT_READY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Command Specific Status */ + case NVME_SC_INVALID_FORMAT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_FORMAT_COMMAND_FAILED; + ascq = SCSI_ASCQ_FORMAT_COMMAND_FAILED; + break; + case NVME_SC_BAD_ATTRIBUTES: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_INVALID_CDB; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + + /* Media Errors */ + case NVME_SC_WRITE_FAULT: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_PERIPHERAL_DEV_WRITE_FAULT; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_READ_ERROR: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_UNRECOVERED_READ_ERROR; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_GUARD_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_GUARD_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_GUARD_CHECK_FAILED; + break; + case NVME_SC_APPTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_APPTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_APPTAG_CHECK_FAILED; + break; + case NVME_SC_REFTAG_CHECK: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MEDIUM_ERROR; + asc = SCSI_ASC_LOG_BLOCK_REFTAG_CHECK_FAILED; + ascq = SCSI_ASCQ_LOG_BLOCK_REFTAG_CHECK_FAILED; + break; + case NVME_SC_COMPARE_FAILED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = MISCOMPARE; + asc = SCSI_ASC_MISCOMPARE_DURING_VERIFY; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + case NVME_SC_ACCESS_DENIED: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_ACCESS_DENIED_INVALID_LUN_ID; + ascq = 
SCSI_ASCQ_INVALID_LUN_ID; + break; + + /* Unspecified/Default */ + case NVME_SC_CMDID_CONFLICT: + case NVME_SC_CMD_SEQ_ERROR: + case NVME_SC_CQ_INVALID: + case NVME_SC_QID_INVALID: + case NVME_SC_QUEUE_SIZE: + case NVME_SC_ABORT_LIMIT: + case NVME_SC_ABORT_MISSING: + case NVME_SC_ASYNC_LIMIT: + case NVME_SC_FIRMWARE_SLOT: + case NVME_SC_FIRMWARE_IMAGE: + case NVME_SC_INVALID_VECTOR: + case NVME_SC_INVALID_LOG_PAGE: + default: + status = SAM_STAT_CHECK_CONDITION; + sense_key = ILLEGAL_REQUEST; + asc = SCSI_ASC_NO_SENSE; + ascq = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + break; + } + + res = nvme_trans_completion(hdr, status, sense_key, asc, ascq); + + return res; +} + +/* INQUIRY Helper Functions */ + +static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + int xfer_len; + u8 resp_data_format = 0x02; + u8 protect; + u8 cmdque = 0x01 << 1; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify - use DPS value for PROTECT field */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + /* + * If nvme_sc was -ve, res will be -ve here. + * If nvme_sc was +ve, the status would have been translated, and res + * can only be 0 or -ve. + * - If 0 && nvme_sc > 0, then go into next if where res gets nvme_sc + * - If -ve, return because it's a Linux error. + */ + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[2] = VERSION_SPC_4; + inq_response[3] = resp_data_format; /*normaca=0 | hisup=0 */ + inq_response[4] = ADDITIONAL_STD_INQ_LENGTH; + inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ + inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ + strncpy(&inq_response[8], "NVMe ", 8); + strncpy(&inq_response[16], dev->model, 16); + strncpy(&inq_response[32], dev->firmware_rev, 4); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_supported_vpd_pages(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_SUPPORTED_VPD_PAGES_PAGE; /* Page Code */ + inq_response[3] = INQ_NUM_SUPPORTED_VPD_PAGES; /* Page Length */ + inq_response[4] = INQ_SUPPORTED_VPD_PAGES_PAGE; + inq_response[5] = INQ_UNIT_SERIAL_NUMBER_PAGE; + inq_response[6] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[7] = INQ_EXTENDED_INQUIRY_DATA_PAGE; + inq_response[8] = INQ_BDEV_CHARACTERISTICS_PAGE; + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + return res; +} + +static int nvme_trans_unit_serial_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, + int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); +
inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ + inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ + strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) +{ + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u8 ieee[4]; + int xfer_len; + u32 tmp_id = cpu_to_be64(ns->ns_id); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme controller identify */ + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ctrl = mem; + + /* Since SCSI tried to save 4 bits... [SPC-4(r34) Table 591] */ + ieee[0] = id_ctrl->ieee[0] << 4; + ieee[1] = id_ctrl->ieee[0] >> 4 | id_ctrl->ieee[1] << 4; + ieee[2] = id_ctrl->ieee[1] >> 4 | id_ctrl->ieee[2] << 4; + ieee[3] = id_ctrl->ieee[2] >> 4; + + memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ + inq_response[3] = 20; /* Page Length */ + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x03; /* PIV=0b | Asso=00b | Designator Type=3h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 16; /* Designator Length */ + /* Designator start */ + inq_response[8] = 0x60 | ieee[3]; /* NAA=6h | IEEE ID MSB, High nibble*/ + inq_response[9] = ieee[2]; /* IEEE ID */ + inq_response[10] = ieee[1]; /* IEEE ID */ + inq_response[11] = ieee[0]; /* IEEE ID| Vendor Specific ID... 
*/ + inq_response[12] = (dev->pci_dev->vendor & 0xFF00) >> 8; + inq_response[13] = (dev->pci_dev->vendor & 0x00FF); + inq_response[14] = dev->serial[0]; + inq_response[15] = dev->serial[1]; + inq_response[16] = dev->model[0]; + inq_response[17] = dev->model[1]; + memcpy(&inq_response[18], &tmp_id, sizeof(u32)); + /* Last 2 bytes are zero */ + + xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + return res; +} + +static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + struct nvme_id_ns *id_ns; + int xfer_len; + u8 microcode = 0x80; + u8 spt; + u8 spt_lut[8] = {0, 0, 2, 1, 4, 6, 5, 7}; + u8 grd_chk, app_chk, ref_chk, protect; + u8 uask_sup = 0x20; + u8 v_sup; + u8 luiclr = 0x01; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ns = mem; + spt = spt_lut[(id_ns->dpc) & 0x07] << 3; + (id_ns->dps) ? (protect = 0x01) : (protect = 0); + grd_chk = protect << 2; + app_chk = protect << 1; + ref_chk = protect; + + /* nvme controller identify */ + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_free; + if (nvme_sc) { + res = nvme_sc; + goto out_free; + } + id_ctrl = mem; + v_sup = id_ctrl->vwc; + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_EXTENDED_INQUIRY_DATA_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = microcode | spt | grd_chk | app_chk | ref_chk; + inq_response[5] = uask_sup; + inq_response[6] = v_sup; + inq_response[7] = luiclr; + inq_response[8] = 0; + inq_response[9] = 0; + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + out_free: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out_dma: + kfree(inq_response); + out_mem: + return res; +} + +static int nvme_trans_bdev_char_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + u8 *inq_response; + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + + inq_response = kmalloc(EXTENDED_INQUIRY_DATA_PAGE_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + memset(inq_response, 0, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + inq_response[1] = INQ_BDEV_CHARACTERISTICS_PAGE; /* Page Code */ + inq_response[2] = 0x00; /* Page Length MSB */ + inq_response[3] = 0x3C; /* Page Length LSB */ + inq_response[4] = 0x00; /* Medium Rotation Rate MSB */ + inq_response[5] = 0x01; /* Medium Rotation Rate LSB */ + inq_response[6] = 0x00; /* Form Factor */ + + xfer_len = min(alloc_len, EXTENDED_INQUIRY_DATA_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, 
inq_response, xfer_len); + + kfree(inq_response); + out_mem: + return res; +} + +/* LOG SENSE Helper Functions */ + +static int nvme_trans_log_supp_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + + log_response = kmalloc(LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + + log_response[0] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = SUPPORTED_LOG_PAGES_PAGE_LENGTH; + log_response[4] = LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE; + log_response[5] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + log_response[6] = LOG_PAGE_TEMPERATURE_PAGE; + + xfer_len = min(alloc_len, LOG_PAGE_SUPPORTED_LOG_PAGES_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, + struct sg_io_hdr *hdr, int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u8 temp_c; + u16 temp_k; + + log_response = kmalloc(LOG_INFO_EXCP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(log_response, 0, LOG_INFO_EXCP_PAGE_LENGTH); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c = temp_k - KELVIN_TEMP_FACTOR; + } + + log_response[0] = LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_INFO_EXCP_PAGE_LENGTH; + /* Informational Exceptions Log Parameter 1 Start */ + /* Parameter Code=0x0000 bytes 4,5 */ + log_response[6] = 0x23; /* DU=0, TSD=1, ETC=0, TMC=0, FMT_AND_LNK=11b */ + log_response[7] = 0x04; /* PARAMETER LENGTH */ + /* Add sense Code and qualifier = 0x00 each */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[10] = temp_c; + + xfer_len = min(alloc_len, LOG_INFO_EXCP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, + int alloc_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *log_response; + struct nvme_command c; + struct nvme_dev *dev = ns->dev; + struct nvme_smart_log *smart_log; + dma_addr_t dma_addr; + void *mem; + u32 feature_resp; + u8 temp_c_cur, temp_c_thresh; + u16 temp_k; + + log_response = kmalloc(LOG_TEMP_PAGE_LENGTH, GFP_KERNEL); + if (log_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + 
memset(log_response, 0, LOG_TEMP_PAGE_LENGTH); + + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_smart_log), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out_dma; + } + + /* Get SMART Log Page */ + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_get_log_page; + c.common.nsid = cpu_to_le32(0xFFFFFFFF); + c.common.prp1 = cpu_to_le64(dma_addr); + c.common.cdw10[0] = cpu_to_le32(((sizeof(struct nvme_smart_log) / + BYTES_TO_DWORDS) << 16) | NVME_GET_SMART_LOG_PAGE); + res = nvme_submit_admin_cmd(dev, &c, NULL); + if (res != NVME_SC_SUCCESS) { + temp_c_cur = LOG_TEMP_UNKNOWN; + } else { + smart_log = mem; + temp_k = (smart_log->temperature[1] << 8) + + (smart_log->temperature[0]); + temp_c_cur = temp_k - KELVIN_TEMP_FACTOR; + } + + /* Get Features for Temp Threshold */ + res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + &feature_resp); + if (res != NVME_SC_SUCCESS) + temp_c_thresh = LOG_TEMP_UNKNOWN; + else + temp_c_thresh = (feature_resp & 0xFFFF) - KELVIN_TEMP_FACTOR; + + log_response[0] = LOG_PAGE_TEMPERATURE_PAGE; + /* Subpage=0x00, Page Length MSB=0 */ + log_response[3] = REMAINING_TEMP_PAGE_LENGTH; + /* Temperature Log Parameter 1 (Temperature) Start */ + /* Parameter Code = 0x0000 */ + log_response[6] = 0x01; /* Format and Linking = 01b */ + log_response[7] = 0x02; /* Parameter Length */ + /* Use Temperature from NVMe Get Log Page, convert to C from K */ + log_response[9] = temp_c_cur; + /* Temperature Log Parameter 2 (Reference Temperature) Start */ + log_response[11] = 0x01; /* Parameter Code = 0x0001 */ + log_response[12] = 0x01; /* Format and Linking = 01b */ + log_response[13] = 0x02; /* Parameter Length */ + /* Use Temperature Thresh from NVMe Get Log Page, convert to C from K */ + log_response[15] = temp_c_thresh; + + xfer_len = min(alloc_len, LOG_TEMP_PAGE_LENGTH); + res = nvme_trans_copy_to_user(hdr, log_response, xfer_len); + + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_smart_log), + mem, dma_addr); + out_dma: + kfree(log_response); + out_mem: + return res; +} + +/* MODE SENSE Helper Functions */ + +static int nvme_trans_fill_mode_parm_hdr(u8 *resp, int len, u8 cdb10, u8 llbaa, + u16 mode_data_length, u16 blk_desc_len) +{ + /* Quick check to make sure I don't stomp on my own memory... 
*/ + if ((cdb10 && len < 8) || (!cdb10 && len < 4)) + return SNTI_INTERNAL_ERROR; + + if (cdb10) { + resp[0] = (mode_data_length & 0xFF00) >> 8; + resp[1] = (mode_data_length & 0x00FF); + /* resp[2] and [3] are zero */ + resp[4] = llbaa; + resp[5] = RESERVED_FIELD; + resp[6] = (blk_desc_len & 0xFF00) >> 8; + resp[7] = (blk_desc_len & 0x00FF); + } else { + resp[0] = (mode_data_length & 0x00FF); + /* resp[1] and [2] are zero */ + resp[3] = (blk_desc_len & 0x00FF); + } + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len, u8 llbaa) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + u32 lba_length; + + if (llbaa == 0 && len < MODE_PAGE_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) + return SNTI_INTERNAL_ERROR; + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + + if (llbaa == 0) { + u32 tmp_cap = cpu_to_be32(id_ns->ncap); + /* Byte 4 is reserved */ + u32 tmp_len = cpu_to_be32(lba_length) & 0x00FFFFFF; + + memcpy(resp, &tmp_cap, sizeof(u32)); + memcpy(&resp[4], &tmp_len, sizeof(u32)); + } else { + u64 tmp_cap = cpu_to_be64(id_ns->ncap); + u32 tmp_len = cpu_to_be32(lba_length); + + memcpy(resp, &tmp_cap, sizeof(u64)); + /* Bytes 8, 9, 10, 11 are reserved */ + memcpy(&resp[12], &tmp_len, sizeof(u32)); + } + + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_fill_control_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + if (len < MODE_PAGE_CONTROL_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_CONTROL; + resp[1] = MODE_PAGE_CONTROL_LEN_FIELD; + resp[2] = 0x0E; /* TST=000b, TMF_ONLY=0, DPICZ=1, + * D_SENSE=1, GLTSD=1, RLEC=0 */ + resp[3] = 0x12; /* Q_ALGO_MODIFIER=1h, NUAR=0, QERR=01b */ + /* Byte 4: VS=0, RAC=0, UA_INT=0, SWP=0 */ + resp[5] = 0x40; /* ATO=0, TAS=1, ATMPE=0, RWWP=0, AUTOLOAD=0 */ + /* resp[6] and [7] are obsolete, thus zero */ + resp[8] = 0xFF; /* Busy timeout period = 0xffff */ + resp[9] = 0xFF; + /* Bytes 10,11: Extended selftest completion time = 0x0000 */ + + return SNTI_TRANSLATION_SUCCESS; +} + +static int nvme_trans_fill_caching_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + u32 feature_resp; + u8 vwc; + + if (len < MODE_PAGE_CACHING_LEN) + return SNTI_INTERNAL_ERROR; + + nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + &feature_resp); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + vwc = feature_resp & 0x00000001; + + resp[0] = MODE_PAGE_CACHING; + resp[1] = MODE_PAGE_CACHING_LEN_FIELD; + resp[2] = vwc << 2; + + out: + return res; +} + +static int nvme_trans_fill_pow_cnd_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + 
if (len < MODE_PAGE_POW_CND_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_POWER_CONDITION; + resp[1] = MODE_PAGE_POW_CND_LEN_FIELD; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_inf_exc_page(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *resp, + int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + + if (len < MODE_PAGE_INF_EXC_LEN) + return SNTI_INTERNAL_ERROR; + + resp[0] = MODE_PAGE_INFO_EXCEP; + resp[1] = MODE_PAGE_INF_EXC_LEN_FIELD; + resp[2] = 0x88; + /* All other bytes are zero */ + + return res; +} + +static int nvme_trans_fill_all_pages(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u16 mode_pages_offset_1 = 0; + u16 mode_pages_offset_2, mode_pages_offset_3, mode_pages_offset_4; + + mode_pages_offset_2 = mode_pages_offset_1 + MODE_PAGE_CACHING_LEN; + mode_pages_offset_3 = mode_pages_offset_2 + MODE_PAGE_CONTROL_LEN; + mode_pages_offset_4 = mode_pages_offset_3 + MODE_PAGE_POW_CND_LEN; + + res = nvme_trans_fill_caching_page(ns, hdr, &resp[mode_pages_offset_1], + MODE_PAGE_CACHING_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_control_page(ns, hdr, &resp[mode_pages_offset_2], + MODE_PAGE_CONTROL_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_pow_cnd_page(ns, hdr, &resp[mode_pages_offset_3], + MODE_PAGE_POW_CND_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_fill_inf_exc_page(ns, hdr, &resp[mode_pages_offset_4], + MODE_PAGE_INF_EXC_LEN); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static inline int nvme_trans_get_blk_desc_len(u8 dbd, u8 llbaa) +{ + if (dbd == MODE_SENSE_BLK_DESC_ENABLED) { + /* SPC-4: len = 8 x Num_of_descriptors if llbaa = 0, 16x if 1 */ + return 8 * (llbaa + 1) * MODE_SENSE_BLK_DESC_COUNT; + } else { + return 0; + } +} + +static int nvme_trans_mode_page_create(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd, + u16 alloc_len, u8 cdb10, + int (*mode_page_fill_func) + (struct nvme_ns *, + struct sg_io_hdr *hdr, u8 *, int), + u16 mode_pages_tot_len) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int xfer_len; + u8 *response; + u8 dbd, llbaa; + u16 resp_size; + int mph_size; + u16 mode_pages_offset_1; + u16 blk_desc_len, blk_desc_offset, mode_data_length; + + dbd = GET_MODE_SENSE_DBD(cmd); + llbaa = GET_MODE_SENSE_LLBAA(cmd); + mph_size = GET_MODE_SENSE_MPH_SIZE(cdb10); + blk_desc_len = nvme_trans_get_blk_desc_len(dbd, llbaa); + + resp_size = mph_size + blk_desc_len + mode_pages_tot_len; + /* Refer spc4r34 Table 440 for calculation of Mode data Length field */ + mode_data_length = 3 + (3 * cdb10) + blk_desc_len + mode_pages_tot_len; + + blk_desc_offset = mph_size; + mode_pages_offset_1 = blk_desc_offset + blk_desc_len; + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_mem; + } + memset(response, 0, resp_size); + + res = nvme_trans_fill_mode_parm_hdr(&response[0], mph_size, cdb10, + llbaa, mode_data_length, blk_desc_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + if (blk_desc_len > 0) { + res = nvme_trans_fill_blk_desc(ns, hdr, + &response[blk_desc_offset], + blk_desc_len, llbaa); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + } + res = mode_page_fill_func(ns, hdr, &response[mode_pages_offset_1], + mode_pages_tot_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_free; + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, 
response, xfer_len); + + out_free: + kfree(response); + out_mem: + return res; +} + +/* Read Capacity Helper Functions */ + +static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, + u8 cdb16) +{ + u8 flbas; + u32 lba_length; + u64 rlba; + u8 prot_en; + u8 p_type_lut[4] = {0, 0, 1, 2}; + u64 tmp_rlba; + u32 tmp_rlba_32; + u32 tmp_len; + + flbas = (id_ns->flbas) & 0x0F; + lba_length = (1 << (id_ns->lbaf[flbas].ds)); + rlba = le64_to_cpup(&id_ns->nsze) - 1; + (id_ns->dps) ? (prot_en = 0x01) : (prot_en = 0); + + if (!cdb16) { + if (rlba > 0xFFFFFFFF) + rlba = 0xFFFFFFFF; + tmp_rlba_32 = cpu_to_be32(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba_32, sizeof(u32)); + memcpy(&response[4], &tmp_len, sizeof(u32)); + } else { + tmp_rlba = cpu_to_be64(rlba); + tmp_len = cpu_to_be32(lba_length); + memcpy(response, &tmp_rlba, sizeof(u64)); + memcpy(&response[8], &tmp_len, sizeof(u32)); + response[12] = (p_type_lut[id_ns->dps & 0x3] << 1) | prot_en; + /* P_I_Exponent = 0x0 | LBPPBE = 0x0 */ + /* LBPME = 0 | LBPRZ = 0 | LALBA = 0x00 */ + /* Bytes 16-31 - Reserved */ + } +} + +/* Start Stop Unit Helper Functions */ + +static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 pc, u8 pcmod, u8 start) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + int lowest_pow_st; /* max npss = lowest power consumption */ + unsigned ps_desired = 0; + + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + lowest_pow_st = id_ctrl->npss - 1; + + switch (pc) { + case NVME_POWER_STATE_START_VALID: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0 && start == 0x1) + ps_desired = POWER_STATE_0; + if (pcmod == 0 && start == 0x0) + ps_desired = lowest_pow_st; + break; + case NVME_POWER_STATE_ACTIVE: + /* Action unspecified if POWER CONDITION MODIFIER != 0 */ + if (pcmod == 0) + ps_desired = POWER_STATE_0; + break; + case NVME_POWER_STATE_IDLE: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ + /* min of desired state and (lps-1) because lps is STOP */ + if (pcmod == 0x0) + ps_desired = min(POWER_STATE_1, (lowest_pow_st - 1)); + else if (pcmod == 0x1) + ps_desired = min(POWER_STATE_2, (lowest_pow_st - 1)); + else if (pcmod == 0x2) + ps_desired = min(POWER_STATE_3, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_STANDBY: + /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ + if (pcmod == 0x0) + ps_desired = max(0, (lowest_pow_st - 2)); + else if (pcmod == 0x1) + ps_desired = max(0, (lowest_pow_st - 1)); + break; + case NVME_POWER_STATE_LU_CONTROL: + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) + res = nvme_sc; + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +/* Write Buffer Helper Functions */ +/* Also using this for 
Format Unit, with hdr passed as NULL and buffer_id = 0 */
+
+static int nvme_trans_send_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 opcode, u32 tot_len, u32 offset,
+					u8 buffer_id)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_command c;
+	struct nvme_iod *iod = NULL;
+	unsigned length;
+
+	memset(&c, 0, sizeof(c));
+	c.common.opcode = opcode;
+	if (opcode == nvme_admin_download_fw) {
+		if (hdr->iovec_count > 0) {
+			/* Assuming SGL is not allowed for this command */
+			res = nvme_trans_completion(hdr,
+						SAM_STAT_CHECK_CONDITION,
+						ILLEGAL_REQUEST,
+						SCSI_ASC_INVALID_CDB,
+						SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+			goto out;
+		}
+		iod = nvme_map_user_pages(dev, DMA_TO_DEVICE,
+				(unsigned long)hdr->dxferp, tot_len);
+		if (IS_ERR(iod)) {
+			res = PTR_ERR(iod);
+			goto out;
+		}
+		length = nvme_setup_prps(dev, &c.common, iod, tot_len,
+								GFP_KERNEL);
+		if (length != tot_len) {
+			res = -ENOMEM;
+			goto out_unmap;
+		}
+
+		c.dlfw.numd = (tot_len/BYTES_TO_DWORDS) - 1;
+		c.dlfw.offset = offset/BYTES_TO_DWORDS;
+	} else if (opcode == nvme_admin_activate_fw) {
+		c.common.cdw10[0] = buffer_id;
+		/* AA=01b Replace & activate at reset */
+		c.common.cdw10[0] |= 0x00000008;
+	}
+
+	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_unmap;
+	if (nvme_sc)
+		res = nvme_sc;
+
+ out_unmap:
+	if (opcode == nvme_admin_download_fw) {
+		nvme_unmap_user_pages(dev, DMA_TO_DEVICE, iod);
+		nvme_free_iod(dev, iod);
+	}
+ out:
+	return res;
+}
+
+/* Mode Select Helper Functions */
+
+static inline void nvme_trans_modesel_get_bd_len(u8 *parm_list, u8 cdb10,
+						u16 *bd_len, u8 *llbaa)
+{
+	if (cdb10) {
+		/* 10 Byte CDB */
+		*bd_len = (parm_list[MODE_SELECT_10_BD_OFFSET] << 8) +
+			parm_list[MODE_SELECT_10_BD_OFFSET + 1];
+		/* LLBAA is a single flag bit; isolate it with a bitwise AND */
+		*llbaa = parm_list[MODE_SELECT_10_LLBAA_OFFSET] &
+				MODE_SELECT_10_LLBAA_MASK;
+	} else {
+		/* 6 Byte CDB */
+		*bd_len = parm_list[MODE_SELECT_6_BD_OFFSET];
+	}
+}
+
+static void nvme_trans_modesel_save_bd(struct nvme_ns *ns, u8 *parm_list,
+					u16 idx, u16 bd_len, u8 llbaa)
+{
+	u16 bd_num;
+
+	bd_num = bd_len / ((llbaa == 0) ?
+			SHORT_DESC_BLOCK : LONG_DESC_BLOCK);
+	/* Store block descriptor info if a FORMAT UNIT comes later */
+	/* TODO Saving 1st BD info; what to do if multiple BD received? */
+	if (llbaa == 0) {
+		/* Standard Block Descriptor - spc4r34 7.5.5.1 */
+		ns->mode_select_num_blocks =
+				(parm_list[idx + 1] << 16) +
+				(parm_list[idx + 2] << 8) +
+				(parm_list[idx + 3]);
+
+		ns->mode_select_block_len =
+				(parm_list[idx + 5] << 16) +
+				(parm_list[idx + 6] << 8) +
+				(parm_list[idx + 7]);
+	} else {
+		/* Long LBA Block Descriptor - sbc3r27 6.4.2.3 */
+		ns->mode_select_num_blocks =
+				(((u64)parm_list[idx + 0]) << 56) +
+				(((u64)parm_list[idx + 1]) << 48) +
+				(((u64)parm_list[idx + 2]) << 40) +
+				(((u64)parm_list[idx + 3]) << 32) +
+				(((u64)parm_list[idx + 4]) << 24) +
+				(((u64)parm_list[idx + 5]) << 16) +
+				(((u64)parm_list[idx + 6]) << 8) +
+				((u64)parm_list[idx + 7]);
+
+		ns->mode_select_block_len =
+				(parm_list[idx + 12] << 24) +
+				(parm_list[idx + 13] << 16) +
+				(parm_list[idx + 14] << 8) +
+				(parm_list[idx + 15]);
+	}
+}
+
+static u16 nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *mode_page, u8 page_code)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	struct nvme_dev *dev = ns->dev;
+	unsigned dword11;
+
+	switch (page_code) {
+	case MODE_PAGE_CACHING:
+		dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ?
1 : 0); + nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, + 0, NULL); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + break; + if (nvme_sc) { + res = nvme_sc; + break; + } + break; + case MODE_PAGE_CONTROL: + break; + case MODE_PAGE_POWER_CONDITION: + /* Verify the OS is not trying to set timers */ + if ((mode_page[2] & 0x01) != 0 || (mode_page[3] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + if (!res) + res = SNTI_INTERNAL_ERROR; + break; + } + + return res; +} + +static int nvme_trans_modesel_data(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd, u16 parm_list_len, u8 pf, + u8 sp, u8 cdb10) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u16 bd_len; + u8 llbaa = 0; + u16 index, saved_index; + u8 page_code; + u16 mp_size; + + /* Get parm list from data-in/out buffer */ + parm_list = kmalloc(parm_list_len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + + res = nvme_trans_copy_from_user(hdr, parm_list, parm_list_len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + nvme_trans_modesel_get_bd_len(parm_list, cdb10, &bd_len, &llbaa); + index = (cdb10) ? (MODE_SELECT_10_MPH_SIZE) : (MODE_SELECT_6_MPH_SIZE); + + if (bd_len != 0) { + /* Block Descriptors present, parse */ + nvme_trans_modesel_save_bd(ns, parm_list, index, bd_len, llbaa); + index += bd_len; + } + saved_index = index; + + /* Multiple mode pages may be present; iterate through all */ + /* In 1st Iteration, don't do NVME Command, only check for CDB errors */ + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + if ((page_code != MODE_PAGE_CACHING) && + (page_code != MODE_PAGE_CONTROL) && + (page_code != MODE_PAGE_POWER_CONDITION)) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + index += mp_size; + } while (index < parm_list_len); + + /* In 2nd Iteration, do the NVME Commands */ + index = saved_index; + do { + page_code = parm_list[index] & MODE_SELECT_PAGE_CODE_MASK; + mp_size = parm_list[index + 1] + 2; + res = nvme_trans_modesel_get_mp(ns, hdr, &parm_list[index], + page_code); + if (res != SNTI_TRANSLATION_SUCCESS) + break; + index += mp_size; + } while (index < parm_list_len); + + out_mem: + kfree(parm_list); + out: + return res; +} + +/* Format Unit Helper Functions */ + +static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, + struct sg_io_hdr *hdr) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 flbas; + + /* + * SCSI Expects a MODE SELECT would have been issued prior to + * a FORMAT UNIT, and the block size and number would be used + * from the block descriptor in it. If a MODE SELECT had not + * been issued, FORMAT shall use the current values for both. 
+ */ + + if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ns), &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + if (ns->mode_select_num_blocks == 0) + ns->mode_select_num_blocks = id_ns->ncap; + if (ns->mode_select_block_len == 0) { + flbas = (id_ns->flbas) & 0x0F; + ns->mode_select_block_len = + (1 << (id_ns->lbaf[flbas].ds)); + } + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + mem, dma_addr); + } + out: + return res; +} + +static int nvme_trans_fmt_get_parm_header(struct sg_io_hdr *hdr, u8 len, + u8 format_prot_info, u8 *nvme_pf_code) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 *parm_list; + u8 pf_usage, pf_code; + + parm_list = kmalloc(len, GFP_KERNEL); + if (parm_list == NULL) { + res = -ENOMEM; + goto out; + } + res = nvme_trans_copy_from_user(hdr, parm_list, len); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out_mem; + + if ((parm_list[FORMAT_UNIT_IMMED_OFFSET] & + FORMAT_UNIT_IMMED_MASK) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + + if (len == FORMAT_UNIT_LONG_PARM_LIST_LEN && + (parm_list[FORMAT_UNIT_PROT_INT_OFFSET] & 0x0F) != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_mem; + } + pf_usage = parm_list[FORMAT_UNIT_PROT_FIELD_USAGE_OFFSET] & + FORMAT_UNIT_PROT_FIELD_USAGE_MASK; + pf_code = (pf_usage << 2) | format_prot_info; + switch (pf_code) { + case 0: + *nvme_pf_code = 0; + break; + case 2: + *nvme_pf_code = 1; + break; + case 3: + *nvme_pf_code = 2; + break; + case 7: + *nvme_pf_code = 3; + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out_mem: + kfree(parm_list); + out: + return res; +} + +static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 prot_info) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 i; + u8 flbas, nlbaf; + u8 selected_lbaf = 0xFF; + u32 cdw10 = 0; + struct nvme_command c; + + /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + flbas = (id_ns->flbas) & 0x0F; + nlbaf = id_ns->nlbaf; + + for (i = 0; i < nlbaf; i++) { + if (ns->mode_select_block_len == (1 << (id_ns->lbaf[i].ds))) { + selected_lbaf = i; + break; + } + } + if (selected_lbaf > 0x0F) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + if (ns->mode_select_num_blocks != id_ns->ncap) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + 
ILLEGAL_REQUEST, SCSI_ASC_INVALID_PARAMETER,
+					SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+	}
+
+	cdw10 |= prot_info << 5;
+	cdw10 |= selected_lbaf & 0x0F;
+	memset(&c, 0, sizeof(c));
+	c.format.opcode = nvme_admin_format_nvm;
+	c.format.nsid = cpu_to_le32(ns->ns_id);
+	c.format.cdw10 = cpu_to_le32(cdw10);
+
+	nvme_sc = nvme_submit_admin_cmd(dev, &c, NULL);
+	res = nvme_trans_status_code(hdr, nvme_sc);
+	if (res)
+		goto out_dma;
+	if (nvme_sc)
+		res = nvme_sc;
+
+ out_dma:
+	dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem,
+			  dma_addr);
+ out:
+	return res;
+}
+
+/* Read/Write Helper Functions */
+
+static inline void nvme_trans_get_io_cdb6(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = 0;
+	cdb_info->prot_info = 0;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_6_CDB_LBA_OFFSET) &
+					IO_6_CDB_LBA_MASK;
+	cdb_info->xfer_len = GET_U8_FROM_CDB(cmd, IO_6_CDB_TX_LEN_OFFSET);
+
+	/* sbc3r27 sec 5.32 - TRANSFER LEN of 0 implies a 256 Block transfer */
+	if (cdb_info->xfer_len == 0)
+		cdb_info->xfer_len = IO_6_DEFAULT_TX_LEN;
+}
+
+static inline void nvme_trans_get_io_cdb10(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_10_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	/* mask first, then shift: >> binds tighter than & in C */
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_10_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_10_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U16_FROM_CDB(cmd, IO_10_CDB_TX_LEN_OFFSET);
+}
+
+static inline void nvme_trans_get_io_cdb12(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_12_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_12_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U32_FROM_CDB(cmd, IO_12_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_12_CDB_TX_LEN_OFFSET);
+}
+
+static inline void nvme_trans_get_io_cdb16(u8 *cmd,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	cdb_info->fua = GET_U8_FROM_CDB(cmd, IO_16_CDB_FUA_OFFSET) &
+					IO_CDB_FUA_MASK;
+	cdb_info->prot_info = (GET_U8_FROM_CDB(cmd, IO_16_CDB_WP_OFFSET) &
+					IO_CDB_WP_MASK) >> IO_CDB_WP_SHIFT;
+	cdb_info->lba = GET_U64_FROM_CDB(cmd, IO_16_CDB_LBA_OFFSET);
+	cdb_info->xfer_len = GET_U32_FROM_CDB(cmd, IO_16_CDB_TX_LEN_OFFSET);
+}
+
+static inline u32 nvme_trans_io_get_num_cmds(struct sg_io_hdr *hdr,
+					struct nvme_trans_io_cdb *cdb_info,
+					u32 max_blocks)
+{
+	/* If using iovecs, send one nvme command per vector */
+	if (hdr->iovec_count > 0)
+		return hdr->iovec_count;
+	else if (cdb_info->xfer_len > max_blocks)
+		return ((cdb_info->xfer_len - 1) / max_blocks) + 1;
+	else
+		return 1;
+}
+
+static u16 nvme_trans_io_get_control(struct nvme_ns *ns,
+					struct nvme_trans_io_cdb *cdb_info)
+{
+	u16 control = 0;
+
+	/* When Protection information support is added, implement here */
+
+	if (cdb_info->fua > 0)
+		control |= NVME_RW_FUA;
+
+	return control;
+}
+
+static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+				struct nvme_trans_io_cdb *cdb_info, u8 is_write)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	int nvme_sc;
+	struct nvme_dev *dev = ns->dev;
+	struct nvme_queue *nvmeq;	/* acquired per command in the loop */
+	u32 num_cmds;
+	struct nvme_iod *iod;
+	u64 unit_len;
+	u64 unit_num_blocks;	/* Number of blocks to xfer in each nvme cmd */
+	u32 retcode;
+	u32 i = 0;
+	u64 nvme_offset = 0;
+	void *next_mapping_addr;
+	struct nvme_command c;
+	u8 opcode = (is_write ?
nvme_cmd_write : nvme_cmd_read); + u16 control; + u32 max_blocks = (dev->max_hw_sectors << 9) >> ns->lba_shift; + + num_cmds = nvme_trans_io_get_num_cmds(hdr, cdb_info, max_blocks); + + /* + * This loop handles two cases. + * First, when an SGL is used in the form of an iovec list: + * - Use iov_base as the next mapping address for the nvme command_id + * - Use iov_len as the data transfer length for the command. + * Second, when we have a single buffer + * - If larger than max_blocks, split into chunks, offset + * each nvme command accordingly. + */ + for (i = 0; i < num_cmds; i++) { + memset(&c, 0, sizeof(c)); + if (hdr->iovec_count > 0) { + struct sg_iovec *sgl = hdr->dxferp; + + unit_len = sgl[i].iov_len; + unit_num_blocks = unit_len >> ns->lba_shift; + next_mapping_addr = sgl[i].iov_base; + } else { + unit_num_blocks = min((u64)max_blocks, + (cdb_info->xfer_len - nvme_offset)); + unit_len = unit_num_blocks << ns->lba_shift; + next_mapping_addr = hdr->dxferp + + ((1 << ns->lba_shift) * nvme_offset); + } + + c.rw.opcode = opcode; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(cdb_info->lba + nvme_offset); + c.rw.length = cpu_to_le16(unit_num_blocks - 1); + control = nvme_trans_io_get_control(ns, cdb_info); + c.rw.control = cpu_to_le16(control); + + iod = nvme_map_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + (unsigned long)next_mapping_addr, unit_len); + if (IS_ERR(iod)) { + res = PTR_ERR(iod); + goto out; + } + retcode = nvme_setup_prps(dev, &c.common, iod, unit_len, + GFP_KERNEL); + if (retcode != unit_len) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = -ENOMEM; + goto out; + } + + nvme_offset += unit_num_blocks; + + nvmeq = get_nvmeq(dev); + /* + * Since nvme_submit_sync_cmd sleeps, we can't keep + * preemption disabled. We may be preempted at any + * point, and be rescheduled to a different CPU. That + * will cause cacheline bouncing, but no additional + * races since q_lock already protects against other + * CPUs. + */ + put_nvmeq(nvmeq); + nvme_sc = nvme_submit_sync_cmd(nvmeq, &c, NULL, + NVME_IO_TIMEOUT); + if (nvme_sc != NVME_SC_SUCCESS) { + nvme_unmap_user_pages(dev, + (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + res = nvme_trans_status_code(hdr, nvme_sc); + goto out; + } + nvme_unmap_user_pages(dev, + (is_write) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, + iod); + nvme_free_iod(dev, iod); + } + res = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + + out: + return res; +} + + +/* SCSI Command Translation Functions */ + +static int nvme_trans_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 is_write, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_trans_io_cdb cdb_info; + u8 opcode = cmd[0]; + u64 xfer_bytes; + u64 sum_iov_len = 0; + struct sg_iovec *sgl; + int i; + + /* Extract Fields from CDB */ + switch (opcode) { + case WRITE_6: + case READ_6: + nvme_trans_get_io_cdb6(cmd, &cdb_info); + break; + case WRITE_10: + case READ_10: + nvme_trans_get_io_cdb10(cmd, &cdb_info); + break; + case WRITE_12: + case READ_12: + nvme_trans_get_io_cdb12(cmd, &cdb_info); + break; + case WRITE_16: + case READ_16: + nvme_trans_get_io_cdb16(cmd, &cdb_info); + break; + default: + /* Will never really reach here */ + res = SNTI_INTERNAL_ERROR; + goto out; + } + + /* Calculate total length of transfer (in bytes) */ + if (hdr->iovec_count > 0) { + sgl = hdr->dxferp; + for (i = 0; i < hdr->iovec_count; i++) { + sum_iov_len += sgl[i].iov_len; + /* IO vector sizes should be multiples of block size */ + if (sgl[i].iov_len % (1 << ns->lba_shift) != 0) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_PARAMETER, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + } + } else { + sum_iov_len = hdr->dxfer_len; + } + + /* As Per sg ioctl howto, if the lengths differ, use the lower one */ + xfer_bytes = min(((u64)hdr->dxfer_len), sum_iov_len); + + /* If block count and actual data buffer size dont match, error out */ + if (xfer_bytes != (cdb_info.xfer_len << ns->lba_shift)) { + res = -EINVAL; + goto out; + } + + /* Check for 0 length transfer - it is not illegal */ + if (cdb_info.xfer_len == 0) + goto out; + + /* Send NVMe IO Command(s) */ + res = nvme_trans_do_nvme_io(ns, hdr, &cdb_info, is_write); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + out: + return res; +} + +static int nvme_trans_inquiry(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 evpd; + u8 page_code; + int alloc_len; + u8 *inq_response; + + evpd = GET_INQ_EVPD_BIT(cmd); + page_code = GET_INQ_PAGE_CODE(cmd); + alloc_len = GET_INQ_ALLOC_LENGTH(cmd); + + inq_response = kmalloc(STANDARD_INQUIRY_LENGTH, GFP_KERNEL); + if (inq_response == NULL) { + res = -ENOMEM; + goto out_mem; + } + + if (evpd == 0) { + if (page_code == INQ_STANDARD_INQUIRY_PAGE) { + res = nvme_trans_standard_inquiry_page(ns, hdr, + inq_response, alloc_len); + } else { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + } else { + switch (page_code) { + case VPD_SUPPORTED_PAGES: + res = nvme_trans_supported_vpd_pages(ns, hdr, + inq_response, alloc_len); + break; + case VPD_SERIAL_NUMBER: + res = nvme_trans_unit_serial_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_DEVICE_IDENTIFIERS: + res = nvme_trans_device_id_page(ns, hdr, inq_response, + alloc_len); + break; + case VPD_EXTENDED_INQUIRY: + res = nvme_trans_ext_inq_page(ns, hdr, alloc_len); + break; + case VPD_BLOCK_DEV_CHARACTERISTICS: + res = nvme_trans_bdev_char_page(ns, hdr, alloc_len); + break; + default: + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, + SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + } + kfree(inq_response); + out_mem: + return res; +} + 
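[Editor's aside: all of the INQUIRY translations above are reached from user
space through the SG_IO ioctl serviced by nvme_sg_io() further down. The
following is a minimal sketch of such a caller, assuming the NVMe block
device node routes SG_IO to nvme_sg_io() (as the driver's ioctl handler is
expected to), that the caller holds CAP_SYS_ADMIN as nvme_sg_io() requires,
and that /dev/nvme0n1 is just an illustrative device path.]

	/* user-space sketch: issue a standard INQUIRY through SG_IO */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <scsi/sg.h>

	int main(void)
	{
		unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 }; /* INQUIRY */
		unsigned char buf[96];
		struct sg_io_hdr hdr;
		int fd = open("/dev/nvme0n1", O_RDONLY);

		if (fd < 0)
			return 1;
		memset(&hdr, 0, sizeof(hdr));
		hdr.interface_id = 'S';		/* checked by nvme_sg_io() */
		hdr.cmd_len = sizeof(cdb);
		hdr.cmdp = cdb;
		hdr.dxfer_direction = SG_DXFER_FROM_DEV;
		hdr.dxferp = buf;
		hdr.dxfer_len = sizeof(buf);
		hdr.timeout = 5000;		/* milliseconds */
		if (ioctl(fd, SG_IO, &hdr) == 0)
			printf("vendor %.8s model %.16s\n",
			       (char *)&buf[8], (char *)&buf[16]);
		return 0;
	}

[The vendor/model offsets follow the standard INQUIRY data layout that the
standard inquiry translation fills in.]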
+static int nvme_trans_log_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *cmd)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	u16 alloc_len;
+	u8 sp;
+	u8 pc;
+	u8 page_code;
+
+	sp = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_SP_OFFSET);
+	if (sp != LOG_SENSE_CDB_SP_NOT_ENABLED) {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out;
+	}
+	pc = GET_U8_FROM_CDB(cmd, LOG_SENSE_CDB_PC_OFFSET);
+	page_code = pc & LOG_SENSE_CDB_PAGE_CODE_MASK;
+	pc = (pc & LOG_SENSE_CDB_PC_MASK) >> LOG_SENSE_CDB_PC_SHIFT;
+	if (pc != LOG_SENSE_CDB_PC_CUMULATIVE_VALUES) {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out;
+	}
+	alloc_len = GET_U16_FROM_CDB(cmd, LOG_SENSE_CDB_ALLOC_LENGTH_OFFSET);
+	switch (page_code) {
+	case LOG_PAGE_SUPPORTED_LOG_PAGES_PAGE:
+		res = nvme_trans_log_supp_pages(ns, hdr, alloc_len);
+		break;
+	case LOG_PAGE_INFORMATIONAL_EXCEPTIONS_PAGE:
+		res = nvme_trans_log_info_exceptions(ns, hdr, alloc_len);
+		break;
+	case LOG_PAGE_TEMPERATURE_PAGE:
+		res = nvme_trans_log_temperature(ns, hdr, alloc_len);
+		break;
+	default:
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		break;
+	}
+
+ out:
+	return res;
+}
+
+static int nvme_trans_mode_select(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *cmd)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	u8 cdb10 = 0;
+	u16 parm_list_len;
+	u8 page_format;
+	u8 save_pages;
+
+	page_format = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_PAGE_FORMAT_OFFSET);
+	page_format &= MODE_SELECT_CDB_PAGE_FORMAT_MASK;
+
+	save_pages = GET_U8_FROM_CDB(cmd, MODE_SELECT_CDB_SAVE_PAGES_OFFSET);
+	save_pages &= MODE_SELECT_CDB_SAVE_PAGES_MASK;
+
+	if (GET_OPCODE(cmd) == MODE_SELECT) {
+		parm_list_len = GET_U8_FROM_CDB(cmd,
+				MODE_SELECT_6_CDB_PARAM_LIST_LENGTH_OFFSET);
+	} else {
+		parm_list_len = GET_U16_FROM_CDB(cmd,
+				MODE_SELECT_10_CDB_PARAM_LIST_LENGTH_OFFSET);
+		cdb10 = 1;
+	}
+
+	if (parm_list_len != 0) {
+		/*
+		 * According to SPC-4 r24, a parameter list length field of 0
+		 * shall not be considered an error
+		 */
+		res = nvme_trans_modesel_data(ns, hdr, cmd, parm_list_len,
+						page_format, save_pages, cdb10);
+	}
+
+	return res;
+}
+
+static int nvme_trans_mode_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr,
+					u8 *cmd)
+{
+	int res = SNTI_TRANSLATION_SUCCESS;
+	u16 alloc_len;
+	u8 cdb10 = 0;
+	u8 page_code;
+	u8 pc;
+
+	if (GET_OPCODE(cmd) == MODE_SENSE) {
+		alloc_len = GET_U8_FROM_CDB(cmd, MODE_SENSE6_ALLOC_LEN_OFFSET);
+	} else {
+		alloc_len = GET_U16_FROM_CDB(cmd,
+						MODE_SENSE10_ALLOC_LEN_OFFSET);
+		cdb10 = 1;
+	}
+
+	pc = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CONTROL_OFFSET) &
+						MODE_SENSE_PAGE_CONTROL_MASK;
+	if (pc != MODE_SENSE_PC_CURRENT_VALUES) {
+		res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		goto out;
+	}
+
+	page_code = GET_U8_FROM_CDB(cmd, MODE_SENSE_PAGE_CODE_OFFSET) &
+					MODE_SENSE_PAGE_CODE_MASK;
+	switch (page_code) {
+	case MODE_PAGE_CACHING:
+		res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_caching_page,
+						MODE_PAGE_CACHING_LEN);
+		break;
+	case MODE_PAGE_CONTROL:
+		res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len,
+						cdb10,
+						&nvme_trans_fill_control_page,
+						MODE_PAGE_CONTROL_LEN);
+		break;
+	case MODE_PAGE_POWER_CONDITION:
+		res =
nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_pow_cnd_page, + MODE_PAGE_POW_CND_LEN); + break; + case MODE_PAGE_INFO_EXCEP: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_inf_exc_page, + MODE_PAGE_INF_EXC_LEN); + break; + case MODE_PAGE_RETURN_ALL: + res = nvme_trans_mode_page_create(ns, hdr, cmd, alloc_len, + cdb10, + &nvme_trans_fill_all_pages, + MODE_PAGE_ALL_LEN); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len = READ_CAP_10_RESP_SIZE; + u32 resp_size = READ_CAP_10_RESP_SIZE; + u32 xfer_len; + u8 cdb16; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ns *id_ns; + u8 *response; + + cdb16 = IS_READ_CAP_16(cmd); + if (cdb16) { + alloc_len = GET_READ_CAP_16_ALLOC_LENGTH(cmd); + resp_size = READ_CAP_16_RESP_SIZE; + } + + mem = dma_alloc_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + /* nvme ns identify */ + nvme_sc = nvme_identify(dev, ns->ns_id, 0, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ns = mem; + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out_dma; + } + memset(response, 0, resp_size); + nvme_trans_fill_read_cap(response, id_ns, cdb16); + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ns), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + u32 alloc_len, xfer_len, resp_size; + u8 select_report; + u8 *response; + struct nvme_dev *dev = ns->dev; + dma_addr_t dma_addr; + void *mem; + struct nvme_id_ctrl *id_ctrl; + u32 ll_length, lun_id; + u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; + u32 tmp_len; + + alloc_len = GET_REPORT_LUNS_ALLOC_LENGTH(cmd); + select_report = GET_U8_FROM_CDB(cmd, REPORT_LUNS_SR_OFFSET); + + if ((select_report != ALL_LUNS_RETURNED) && + (select_report != ALL_WELL_KNOWN_LUNS_RETURNED) && + (select_report != RESTRICTED_LUNS_RETURNED)) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } else { + /* NVMe Controller Identify */ + mem = dma_alloc_coherent(&dev->pci_dev->dev, + sizeof(struct nvme_id_ctrl), + &dma_addr, GFP_KERNEL); + if (mem == NULL) { + res = -ENOMEM; + goto out; + } + nvme_sc = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out_dma; + if (nvme_sc) { + res = nvme_sc; + goto out_dma; + } + id_ctrl = mem; + ll_length = id_ctrl->nn * LUN_ENTRY_SIZE; + resp_size = ll_length + LUN_DATA_HEADER_SIZE; + + if (alloc_len < resp_size) { + res = nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out_dma; + } + + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res 
= -ENOMEM; + goto out_dma; + } + memset(response, 0, resp_size); + + /* The first LUN ID will always be 0 per the SAM spec */ + for (lun_id = 0; lun_id < id_ctrl->nn; lun_id++) { + /* + * Set the LUN Id and then increment to the next LUN + * location in the parameter data. + */ + u64 tmp_id = cpu_to_be64(lun_id); + memcpy(&response[lun_id_offset], &tmp_id, sizeof(u64)); + lun_id_offset += LUN_ENTRY_SIZE; + } + tmp_len = cpu_to_be32(ll_length); + memcpy(response, &tmp_len, sizeof(u32)); + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out_dma: + dma_free_coherent(&dev->pci_dev->dev, sizeof(struct nvme_id_ctrl), mem, + dma_addr); + out: + return res; +} + +static int nvme_trans_request_sense(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 alloc_len, xfer_len, resp_size; + u8 desc_format; + u8 *response; + + alloc_len = GET_REQUEST_SENSE_ALLOC_LENGTH(cmd); + desc_format = GET_U8_FROM_CDB(cmd, REQUEST_SENSE_DESC_OFFSET); + desc_format &= REQUEST_SENSE_DESC_MASK; + + resp_size = ((desc_format) ? (DESC_FMT_SENSE_DATA_SIZE) : + (FIXED_FMT_SENSE_DATA_SIZE)); + response = kmalloc(resp_size, GFP_KERNEL); + if (response == NULL) { + res = -ENOMEM; + goto out; + } + memset(response, 0, resp_size); + + if (desc_format == DESCRIPTOR_FORMAT_SENSE_DATA_TYPE) { + /* Descriptor Format Sense Data */ + response[0] = DESC_FORMAT_SENSE_DATA; + response[1] = NO_SENSE; + /* TODO How is LOW POWER CONDITION ON handled? (byte 2) */ + response[2] = SCSI_ASC_NO_SENSE; + response[3] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* SDAT_OVFL = 0 | Additional Sense Length = 0 */ + } else { + /* Fixed Format Sense Data */ + response[0] = FIXED_SENSE_DATA; + /* Byte 1 = Obsolete */ + response[2] = NO_SENSE; /* FM, EOM, ILI, SDAT_OVFL = 0 */ + /* Bytes 3-6 - Information - set to zero */ + response[7] = FIXED_SENSE_DATA_ADD_LENGTH; + /* Bytes 8-11 - Cmd Specific Information - set to zero */ + response[12] = SCSI_ASC_NO_SENSE; + response[13] = SCSI_ASCQ_CAUSE_NOT_REPORTABLE; + /* Byte 14 = Field Replaceable Unit Code = 0 */ + /* Bytes 15-17 - SKSV=0; Sense Key Specific = 0 */ + } + + xfer_len = min(alloc_len, resp_size); + res = nvme_trans_copy_to_user(hdr, response, xfer_len); + + kfree(response); + out: + return res; +} + +static int nvme_trans_security_protocol(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); +} + +static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + u8 immed, pcmod, pc, no_flush, start; + + immed = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_IMMED_OFFSET); + pcmod = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_MOD_OFFSET); + pc = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_POWER_COND_OFFSET); + no_flush = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_NO_FLUSH_OFFSET); + start = GET_U8_FROM_CDB(cmd, START_STOP_UNIT_CDB_START_OFFSET); + + immed &= START_STOP_UNIT_CDB_IMMED_MASK; + pcmod &= START_STOP_UNIT_CDB_POWER_COND_MOD_MASK; + pc = (pc & START_STOP_UNIT_CDB_POWER_COND_MASK) >> NIBBLE_SHIFT; + no_flush &= START_STOP_UNIT_CDB_NO_FLUSH_MASK; + start &= START_STOP_UNIT_CDB_START_MASK; + + if (immed != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, 
SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } else { + if (no_flush == 0) { + /* Issue NVME FLUSH command prior to START STOP UNIT */ + nvme_sc = nvme_submit_flush_data(nvmeq, ns); + put_nvmeq(nvmeq); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) { + res = nvme_sc; + goto out; + } + } + /* Setup the expected power state transition */ + res = nvme_trans_power_state(ns, hdr, pc, pcmod, start); + } + + out: + return res; +} + +static int nvme_trans_synchronize_cache(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + int nvme_sc; + struct nvme_queue *nvmeq = get_nvmeq(ns->dev); + put_nvmeq(nvmeq); + nvme_sc = nvme_submit_flush_data(nvmeq, ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + goto out; + if (nvme_sc) + res = nvme_sc; + + out: + return res; +} + +static int nvme_trans_format_unit(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u8 parm_hdr_len = 0; + u8 nvme_pf_code = 0; + u8 format_prot_info, long_list, format_data; + + format_prot_info = GET_U8_FROM_CDB(cmd, + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_OFFSET); + long_list = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_LONG_LIST_OFFSET); + format_data = GET_U8_FROM_CDB(cmd, FORMAT_UNIT_CDB_FORMAT_DATA_OFFSET); + + format_prot_info = (format_prot_info & + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_MASK) >> + FORMAT_UNIT_CDB_FORMAT_PROT_INFO_SHIFT; + long_list &= FORMAT_UNIT_CDB_LONG_LIST_MASK; + format_data &= FORMAT_UNIT_CDB_FORMAT_DATA_MASK; + + if (format_data != 0) { + if (format_prot_info != 0) { + if (long_list == 0) + parm_hdr_len = FORMAT_UNIT_SHORT_PARM_LIST_LEN; + else + parm_hdr_len = FORMAT_UNIT_LONG_PARM_LIST_LEN; + } + } else if (format_data == 0 && format_prot_info != 0) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + + /* Get parm header from data-in/out buffer */ + /* + * According to the translation spec, the only fields in the parameter + * list we are concerned with are in the header. So allocate only that. 
+ */ + if (parm_hdr_len > 0) { + res = nvme_trans_fmt_get_parm_header(hdr, parm_hdr_len, + format_prot_info, &nvme_pf_code); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + } + + /* Attempt to activate any previously downloaded firmware image */ + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, 0, 0, 0); + + /* Determine Block size and count and send format command */ + res = nvme_trans_fmt_set_blk_size_count(ns, hdr); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + + res = nvme_trans_fmt_send_cmd(ns, hdr, nvme_pf_code); + + out: + return res; +} + +static int nvme_trans_test_unit_ready(struct nvme_ns *ns, + struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + struct nvme_dev *dev = ns->dev; + + if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + NOT_READY, SCSI_ASC_LUN_NOT_READY, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + else + res = nvme_trans_completion(hdr, SAM_STAT_GOOD, NO_SENSE, 0, 0); + + return res; +} + +static int nvme_trans_write_buffer(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *cmd) +{ + int res = SNTI_TRANSLATION_SUCCESS; + u32 buffer_offset, parm_list_length; + u8 buffer_id, mode; + + parm_list_length = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_PARM_LIST_LENGTH_OFFSET); + if (parm_list_length % BYTES_TO_DWORDS != 0) { + /* NVMe expects Firmware file to be a whole number of DWORDS */ + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + buffer_id = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_ID_OFFSET); + if (buffer_id > NVME_MAX_FIRMWARE_SLOT) { + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + goto out; + } + mode = GET_U8_FROM_CDB(cmd, WRITE_BUFFER_CDB_MODE_OFFSET) & + WRITE_BUFFER_CDB_MODE_MASK; + buffer_offset = + GET_U24_FROM_CDB(cmd, WRITE_BUFFER_CDB_BUFFER_OFFSET_OFFSET); + + switch (mode) { + case DOWNLOAD_SAVE_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + if (res != SNTI_TRANSLATION_SUCCESS) + goto out; + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case DOWNLOAD_SAVE_DEFER_ACTIVATE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_download_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + case ACTIVATE_DEFERRED_MICROCODE: + res = nvme_trans_send_fw_cmd(ns, hdr, nvme_admin_activate_fw, + parm_list_length, buffer_offset, + buffer_id); + break; + default: + res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + break; + } + + out: + return res; +} + +static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) +{ + u8 cmd[BLK_MAX_CDB]; + int retcode; + unsigned int opcode; + + if (hdr->cmdp == NULL) + return -EMSGSIZE; + if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) + return -EFAULT; + + opcode = cmd[0]; + + switch (opcode) { + case READ_6: + case READ_10: + case READ_12: + case READ_16: + retcode = nvme_trans_io(ns, hdr, 0, cmd); + break; + case WRITE_6: + case WRITE_10: + case WRITE_12: + case WRITE_16: + retcode = nvme_trans_io(ns, hdr, 1, cmd); + break; + case INQUIRY: + retcode = nvme_trans_inquiry(ns, hdr, cmd); + break; + case LOG_SENSE: + retcode = nvme_trans_log_sense(ns, hdr, cmd); + break; + case 
MODE_SELECT:
+	case MODE_SELECT_10:
+		retcode = nvme_trans_mode_select(ns, hdr, cmd);
+		break;
+	case MODE_SENSE:
+	case MODE_SENSE_10:
+		retcode = nvme_trans_mode_sense(ns, hdr, cmd);
+		break;
+	case READ_CAPACITY:
+		retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+		break;
+	case SERVICE_ACTION_IN:
+		if (IS_READ_CAP_16(cmd))
+			retcode = nvme_trans_read_capacity(ns, hdr, cmd);
+		else
+			goto out;
+		break;
+	case REPORT_LUNS:
+		retcode = nvme_trans_report_luns(ns, hdr, cmd);
+		break;
+	case REQUEST_SENSE:
+		retcode = nvme_trans_request_sense(ns, hdr, cmd);
+		break;
+	case SECURITY_PROTOCOL_IN:
+	case SECURITY_PROTOCOL_OUT:
+		retcode = nvme_trans_security_protocol(ns, hdr, cmd);
+		break;
+	case START_STOP:
+		retcode = nvme_trans_start_stop(ns, hdr, cmd);
+		break;
+	case SYNCHRONIZE_CACHE:
+		retcode = nvme_trans_synchronize_cache(ns, hdr, cmd);
+		break;
+	case FORMAT_UNIT:
+		retcode = nvme_trans_format_unit(ns, hdr, cmd);
+		break;
+	case TEST_UNIT_READY:
+		retcode = nvme_trans_test_unit_ready(ns, hdr, cmd);
+		break;
+	case WRITE_BUFFER:
+		retcode = nvme_trans_write_buffer(ns, hdr, cmd);
+		break;
+	default:
+ out:
+		retcode = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION,
+				ILLEGAL_REQUEST, SCSI_ASC_ILLEGAL_COMMAND,
+				SCSI_ASCQ_CAUSE_NOT_REPORTABLE);
+		break;
+	}
+	return retcode;
+}
+
+int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr)
+{
+	struct sg_io_hdr hdr;
+	int retcode;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+	if (copy_from_user(&hdr, u_hdr, sizeof(hdr)))
+		return -EFAULT;
+	if (hdr.interface_id != 'S')
+		return -EINVAL;
+	if (hdr.cmd_len > BLK_MAX_CDB)
+		return -EINVAL;
+
+	retcode = nvme_scsi_translate(ns, &hdr);
+	if (retcode < 0)
+		return retcode;
+	if (retcode > 0)
+		retcode = SNTI_TRANSLATION_SUCCESS;
+	if (copy_to_user(u_hdr, &hdr, sizeof(sg_io_hdr_t)) > 0)
+		return -EFAULT;
+
+	return retcode;
+}
+
+int nvme_sg_get_version_num(int __user *ip)
+{
+	return put_user(sg_version_num, ip);
+}

diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index f1974cab60cf..aa575033dbe7 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -546,6 +546,8 @@ struct nvme_ns {
 
 	int ns_id;
 	int lba_shift;
+	u64 mode_select_num_blocks;
+	u32 mode_select_block_len;
 };
 
 /*
@@ -563,6 +565,39 @@ struct nvme_iod {
 	dma_addr_t first_dma;
 	struct scatterlist sg[0];
 };
+
+/**
+ * nvme_free_iod - frees an nvme_iod
+ * @dev: The device that the I/O was submitted to
+ * @iod: The memory to free
+ */
+void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod);
+
+int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
+			struct nvme_iod *iod, int total_len, gfp_t gfp);
+struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write,
+			unsigned long addr, unsigned length);
+void nvme_unmap_user_pages(struct nvme_dev *dev, int write,
+			struct nvme_iod *iod);
+struct nvme_queue *get_nvmeq(struct nvme_dev *dev);
+void put_nvmeq(struct nvme_queue *nvmeq);
+int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
+						u32 *result, unsigned timeout);
+int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns);
+int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *,
+							u32 *result);
+int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns,
+							dma_addr_t dma_addr);
+int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid,
+			dma_addr_t dma_addr, u32 *result);
+int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11,
+			dma_addr_t dma_addr, u32 *result);
+
+struct sg_io_hdr;
+
+int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr);
+int nvme_sg_get_version_num(int __user *ip);
+
 #endif
 
 #endif /* _LINUX_NVME_H */
--
cgit

From fddddb52a6c4e2438f4514ed979183653ca0732a Mon Sep 17 00:00:00 2001
From: Thomas Petazzoni
Date: Thu, 21 Mar 2013 17:59:14 +0100
Subject: bus: introduce a Marvell EBU MBus driver

The Marvell EBU SoCs have a configurable physical address space
layout: the physical ranges of memory used to address PCI(e)
interfaces, NOR flashes, SRAM and various other types of memory are
configurable by software, through a mechanism of so-called 'address
decoding windows'.

This new driver mvebu-mbus consolidates the existing code to address
the configuration of these memory ranges, which is spread across
mach-mvebu, mach-orion5x, mach-mv78xx0, mach-dove and mach-kirkwood.
Following patches convert each Marvell EBU SoC family to use this
driver, therefore removing the old code that was configuring the
address decoding windows.

It is worth mentioning that the MVEBU_MBUS Kconfig option is
intentionally added as a blind option. The new driver implements and
exports the mv_mbus_dram_info() function, which is used by various
Marvell drivers throughout the tree to get access to window
configuration parameters that they require. This function is also
implemented in arch/arm/plat-orion/addr-map.c, which ultimately gets
removed at the end of this patch series. So, in order to preserve
bisectability, we want to ensure that *either* this new driver, *or*
the legacy code in plat-orion/addr-map.c gets compiled in. By making
MVEBU_MBUS a blind option, we are sure that only a platform that does
'select MVEBU_MBUS' will get this new driver compiled in. Therefore,
throughout the next patches that convert the Marvell
sub-architectures one after the other to this new driver, we add the
'select MVEBU_MBUS' and also remove plat-orion/addr-map.c from the
build for that specific sub-architecture. This ensures that
bisectability is preserved.

Earlier versions of this driver had a DT binding, but since those
were not yet agreed upon, they were removed. The driver still uses
of_device_id to find the SoC specific details according to the
string passed to mvebu_mbus_init(). The plan is to re-introduce a
proper DT binding as a followup set of patches.

Signed-off-by: Thomas Petazzoni
Acked-by: Arnd Bergmann
Signed-off-by: Jason Cooper
---
 drivers/bus/Kconfig      |   7 +
 drivers/bus/Makefile     |   1 +
 drivers/bus/mvebu-mbus.c | 867 +++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mbus.h     |  24 +-
 4 files changed, 898 insertions(+), 1 deletion(-)
 create mode 100644 drivers/bus/mvebu-mbus.c
(limited to 'include/linux')

diff --git a/drivers/bus/Kconfig b/drivers/bus/Kconfig
index 0f51ed687dc8..b05ecab915c4 100644
--- a/drivers/bus/Kconfig
+++ b/drivers/bus/Kconfig
@@ -4,6 +4,13 @@
 
 menu "Bus devices"
 
+config MVEBU_MBUS
+	bool
+	depends on PLAT_ORION
+	help
+	  Driver needed for the MBus configuration on Marvell EBU SoCs
+	  (Kirkwood, Dove, Orion5x, MV78XX0 and Armada 370/XP).
+
 config OMAP_OCP2SCP
 	tristate "OMAP OCP2SCP DRIVER"
 	depends on ARCH_OMAP2PLUS

diff --git a/drivers/bus/Makefile b/drivers/bus/Makefile
index 45d997c85453..3c7b53c12091 100644
--- a/drivers/bus/Makefile
+++ b/drivers/bus/Makefile
@@ -2,6 +2,7 @@
 # Makefile for the bus drivers.
 #
 
+obj-$(CONFIG_MVEBU_MBUS)	+= mvebu-mbus.o
 obj-$(CONFIG_OMAP_OCP2SCP)	+= omap-ocp2scp.o
 
 # Interconnect bus driver for OMAP SoCs.
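[Editor's aside: before the driver itself, a sketch of how the exported
mv_mbus_dram_info() is typically consumed. The mbus_dram_target_info and
mbus_dram_window fields come from include/linux/mbus.h; the register
offsets (the 0x10/0x14 pair and the 8-byte stride) are invented for
illustration and do not belong to any real device.]

	#include <linux/io.h>
	#include <linux/mbus.h>

	/* Hypothetical peripheral: program one base/control register pair
	 * per SDRAM chip select so the device's DMA engine can reach DRAM. */
	static void example_setup_dram_windows(void __iomem *regs)
	{
		const struct mbus_dram_target_info *dram = mv_mbus_dram_info();
		int i;

		for (i = 0; i < dram->num_cs; i++) {
			const struct mbus_dram_window *cs = dram->cs + i;

			/* window base: top 16 bits of the CS physical base */
			writel(cs->base & 0xffff0000, regs + 0x10 + i * 8);
			/* window control: size, attribute, target, enable */
			writel(((cs->size - 1) & 0xffff0000) |
			       (cs->mbus_attr << 8) |
			       (dram->mbus_dram_target_id << 4) | 1,
			       regs + 0x14 + i * 8);
		}
	}

[This mirrors the pattern used by Marvell network and DMA drivers with the
same structure, which is why the commit message above insists that exactly
one provider of mv_mbus_dram_info() be compiled in at any point in the
series.]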
diff --git a/drivers/bus/mvebu-mbus.c b/drivers/bus/mvebu-mbus.c
new file mode 100644
index 000000000000..586d03e29e9e
--- /dev/null
+++ b/drivers/bus/mvebu-mbus.c
@@ -0,0 +1,867 @@
+/*
+ * Address map functions for Marvell EBU SoCs (Kirkwood, Armada
+ * 370/XP, Dove, Orion5x and MV78xx0)
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2.  This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ *
+ * The Marvell EBU SoCs have a configurable physical address space:
+ * the physical address at which certain devices (PCIe, NOR, NAND,
+ * etc.) sit can be configured. The configuration takes place through
+ * two sets of registers:
+ *
+ * - One to configure the access of the CPU to the devices. Depending
+ *   on the families, there are between 8 and 20 configurable windows;
+ *   each can be used to create a physical memory window that maps to
+ *   a specific device. Devices are identified by a tuple (target,
+ *   attribute).
+ *
+ * - One to configure the access of the CPU to the SDRAM. There are
+ *   either 2 (for Dove) or 4 (for other families) windows to map the
+ *   SDRAM into the physical address space.
+ *
+ * This driver:
+ *
+ * - Reads out the SDRAM address decoding windows at initialization
+ *   time, and fills the mvebu_mbus_dram_info structure with this
+ *   information. The exported function mv_mbus_dram_info() allows
+ *   device drivers to get the information about the SDRAM address
+ *   decoding windows. This is needed because devices also have their
+ *   own windows (configured through registers that are part of each
+ *   device's register space), and therefore the drivers for Marvell
+ *   devices have to configure those device -> SDRAM windows to ensure
+ *   that DMA works properly.
+ *
+ * - Provides an API for platform code or device drivers to
+ *   dynamically add or remove address decoding windows for the CPU ->
+ *   device accesses. This API is mvebu_mbus_add_window(),
+ *   mvebu_mbus_add_window_remap_flags() and
+ *   mvebu_mbus_del_window(). Since the (target, attribute) values
+ *   differ from one SoC family to another, the API uses a 'const char
+ *   *' string to identify devices, and this driver is responsible for
+ *   knowing the mapping between the name of a device and its
+ *   corresponding (target, attribute) in the current SoC family.
+ *
+ * - Provides a debugfs interface in /sys/kernel/debug/mvebu-mbus/ to
+ *   see the list of CPU -> SDRAM windows and their configuration
+ *   (file 'sdram') and the list of CPU -> devices windows and their
+ *   configuration (file 'devices').
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mbus.h>
+#include <linux/io.h>
+#include <linux/ioport.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/debugfs.h>
+
+/*
+ * DDR target is the same on all platforms.
+ */ +#define TARGET_DDR 0 + +/* + * CPU Address Decode Windows registers + */ +#define WIN_CTRL_OFF 0x0000 +#define WIN_CTRL_ENABLE BIT(0) +#define WIN_CTRL_TGT_MASK 0xf0 +#define WIN_CTRL_TGT_SHIFT 4 +#define WIN_CTRL_ATTR_MASK 0xff00 +#define WIN_CTRL_ATTR_SHIFT 8 +#define WIN_CTRL_SIZE_MASK 0xffff0000 +#define WIN_CTRL_SIZE_SHIFT 16 +#define WIN_BASE_OFF 0x0004 +#define WIN_BASE_LOW 0xffff0000 +#define WIN_BASE_HIGH 0xf +#define WIN_REMAP_LO_OFF 0x0008 +#define WIN_REMAP_LOW 0xffff0000 +#define WIN_REMAP_HI_OFF 0x000c + +#define ATTR_HW_COHERENCY (0x1 << 4) + +#define DDR_BASE_CS_OFF(n) (0x0000 + ((n) << 3)) +#define DDR_BASE_CS_HIGH_MASK 0xf +#define DDR_BASE_CS_LOW_MASK 0xff000000 +#define DDR_SIZE_CS_OFF(n) (0x0004 + ((n) << 3)) +#define DDR_SIZE_ENABLED BIT(0) +#define DDR_SIZE_CS_MASK 0x1c +#define DDR_SIZE_CS_SHIFT 2 +#define DDR_SIZE_MASK 0xff000000 + +#define DOVE_DDR_BASE_CS_OFF(n) ((n) << 4) + +struct mvebu_mbus_mapping { + const char *name; + u8 target; + u8 attr; + u8 attrmask; +}; + +/* + * Masks used for the 'attrmask' field of mvebu_mbus_mapping. They + * allow to get the real attribute value, discarding the special bits + * used to select a PCI MEM region or a PCI WA region. This allows the + * debugfs code to reverse-match the name of a device from its + * target/attr values. + * + * For all devices except PCI, all bits of 'attr' must be + * considered. For most SoCs, only bit 3 should be ignored (it allows + * to select between PCI MEM and PCI I/O). On Orion5x however, there + * is the special bit 5 to select a PCI WA region. + */ +#define MAPDEF_NOMASK 0xff +#define MAPDEF_PCIMASK 0xf7 +#define MAPDEF_ORIONPCIMASK 0xd7 + +/* Macro used to define one mvebu_mbus_mapping entry */ +#define MAPDEF(__n, __t, __a, __m) \ + { .name = __n, .target = __t, .attr = __a, .attrmask = __m } + +struct mvebu_mbus_state; + +struct mvebu_mbus_soc_data { + unsigned int num_wins; + unsigned int num_remappable_wins; + unsigned int (*win_cfg_offset)(const int win); + void (*setup_cpu_target)(struct mvebu_mbus_state *s); + int (*show_cpu_target)(struct mvebu_mbus_state *s, + struct seq_file *seq, void *v); + const struct mvebu_mbus_mapping *map; +}; + +struct mvebu_mbus_state { + void __iomem *mbuswins_base; + void __iomem *sdramwins_base; + struct dentry *debugfs_root; + struct dentry *debugfs_sdram; + struct dentry *debugfs_devs; + const struct mvebu_mbus_soc_data *soc; + int hw_io_coherency; +}; + +static struct mvebu_mbus_state mbus_state; + +static struct mbus_dram_target_info mvebu_mbus_dram_info; +const struct mbus_dram_target_info *mv_mbus_dram_info(void) +{ + return &mvebu_mbus_dram_info; +} +EXPORT_SYMBOL_GPL(mv_mbus_dram_info); + +/* + * Functions to manipulate the address decoding windows + */ + +static void mvebu_mbus_read_window(struct mvebu_mbus_state *mbus, + int win, int *enabled, u64 *base, + u32 *size, u8 *target, u8 *attr, + u64 *remap) +{ + void __iomem *addr = mbus->mbuswins_base + + mbus->soc->win_cfg_offset(win); + u32 basereg = readl(addr + WIN_BASE_OFF); + u32 ctrlreg = readl(addr + WIN_CTRL_OFF); + + if (!(ctrlreg & WIN_CTRL_ENABLE)) { + *enabled = 0; + return; + } + + *enabled = 1; + *base = ((u64)basereg & WIN_BASE_HIGH) << 32; + *base |= (basereg & WIN_BASE_LOW); + *size = (ctrlreg | ~WIN_CTRL_SIZE_MASK) + 1; + + if (target) + *target = (ctrlreg & WIN_CTRL_TGT_MASK) >> WIN_CTRL_TGT_SHIFT; + + if (attr) + *attr = (ctrlreg & WIN_CTRL_ATTR_MASK) >> WIN_CTRL_ATTR_SHIFT; + + if (remap) { + if (win < mbus->soc->num_remappable_wins) { + u32 remap_low = readl(addr + 
WIN_REMAP_LO_OFF); + u32 remap_hi = readl(addr + WIN_REMAP_HI_OFF); + *remap = ((u64)remap_hi << 32) | remap_low; + } else + *remap = 0; + } +} + +static void mvebu_mbus_disable_window(struct mvebu_mbus_state *mbus, + int win) +{ + void __iomem *addr; + + addr = mbus->mbuswins_base + mbus->soc->win_cfg_offset(win); + + writel(0, addr + WIN_BASE_OFF); + writel(0, addr + WIN_CTRL_OFF); + if (win < mbus->soc->num_remappable_wins) { + writel(0, addr + WIN_REMAP_LO_OFF); + writel(0, addr + WIN_REMAP_HI_OFF); + } +} + +/* Checks whether the given window number is available */ +static int mvebu_mbus_window_is_free(struct mvebu_mbus_state *mbus, + const int win) +{ + void __iomem *addr = mbus->mbuswins_base + + mbus->soc->win_cfg_offset(win); + u32 ctrl = readl(addr + WIN_CTRL_OFF); + return !(ctrl & WIN_CTRL_ENABLE); +} + +/* + * Checks whether the given (base, base+size) area doesn't overlap an + * existing region + */ +static int mvebu_mbus_window_conflicts(struct mvebu_mbus_state *mbus, + phys_addr_t base, size_t size, + u8 target, u8 attr) +{ + u64 end = (u64)base + size; + int win; + + for (win = 0; win < mbus->soc->num_wins; win++) { + u64 wbase, wend; + u32 wsize; + u8 wtarget, wattr; + int enabled; + + mvebu_mbus_read_window(mbus, win, + &enabled, &wbase, &wsize, + &wtarget, &wattr, NULL); + + if (!enabled) + continue; + + wend = wbase + wsize; + + /* + * Check if the current window overlaps with the + * proposed physical range + */ + if ((u64)base < wend && end > wbase) + return 0; + + /* + * Check if target/attribute conflicts + */ + if (target == wtarget && attr == wattr) + return 0; + } + + return 1; +} + +static int mvebu_mbus_find_window(struct mvebu_mbus_state *mbus, + phys_addr_t base, size_t size) +{ + int win; + + for (win = 0; win < mbus->soc->num_wins; win++) { + u64 wbase; + u32 wsize; + int enabled; + + mvebu_mbus_read_window(mbus, win, + &enabled, &wbase, &wsize, + NULL, NULL, NULL); + + if (!enabled) + continue; + + if (base == wbase && size == wsize) + return win; + } + + return -ENODEV; +} + +static int mvebu_mbus_setup_window(struct mvebu_mbus_state *mbus, + int win, phys_addr_t base, size_t size, + phys_addr_t remap, u8 target, + u8 attr) +{ + void __iomem *addr = mbus->mbuswins_base + + mbus->soc->win_cfg_offset(win); + u32 ctrl, remap_addr; + + ctrl = ((size - 1) & WIN_CTRL_SIZE_MASK) | + (attr << WIN_CTRL_ATTR_SHIFT) | + (target << WIN_CTRL_TGT_SHIFT) | + WIN_CTRL_ENABLE; + + writel(base & WIN_BASE_LOW, addr + WIN_BASE_OFF); + writel(ctrl, addr + WIN_CTRL_OFF); + if (win < mbus->soc->num_remappable_wins) { + if (remap == MVEBU_MBUS_NO_REMAP) + remap_addr = base; + else + remap_addr = remap; + writel(remap_addr & WIN_REMAP_LOW, addr + WIN_REMAP_LO_OFF); + writel(0, addr + WIN_REMAP_HI_OFF); + } + + return 0; +} + +static int mvebu_mbus_alloc_window(struct mvebu_mbus_state *mbus, + phys_addr_t base, size_t size, + phys_addr_t remap, u8 target, + u8 attr) +{ + int win; + + if (remap == MVEBU_MBUS_NO_REMAP) { + for (win = mbus->soc->num_remappable_wins; + win < mbus->soc->num_wins; win++) + if (mvebu_mbus_window_is_free(mbus, win)) + return mvebu_mbus_setup_window(mbus, win, base, + size, remap, + target, attr); + } + + + for (win = 0; win < mbus->soc->num_wins; win++) + if (mvebu_mbus_window_is_free(mbus, win)) + return mvebu_mbus_setup_window(mbus, win, base, size, + remap, target, attr); + + return -ENOMEM; +} + +/* + * Debugfs debugging + */ + +/* Common function used for Dove, Kirkwood, Armada 370/XP and Orion 5x */ +static int mvebu_sdram_debug_show_orion(struct 
mvebu_mbus_state *mbus, + struct seq_file *seq, void *v) +{ + int i; + + for (i = 0; i < 4; i++) { + u32 basereg = readl(mbus->sdramwins_base + DDR_BASE_CS_OFF(i)); + u32 sizereg = readl(mbus->sdramwins_base + DDR_SIZE_CS_OFF(i)); + u64 base; + u32 size; + + if (!(sizereg & DDR_SIZE_ENABLED)) { + seq_printf(seq, "[%d] disabled\n", i); + continue; + } + + base = ((u64)basereg & DDR_BASE_CS_HIGH_MASK) << 32; + base |= basereg & DDR_BASE_CS_LOW_MASK; + size = (sizereg | ~DDR_SIZE_MASK); + + seq_printf(seq, "[%d] %016llx - %016llx : cs%d\n", + i, (unsigned long long)base, + (unsigned long long)base + size + 1, + (sizereg & DDR_SIZE_CS_MASK) >> DDR_SIZE_CS_SHIFT); + } + + return 0; +} + +/* Special function for Dove */ +static int mvebu_sdram_debug_show_dove(struct mvebu_mbus_state *mbus, + struct seq_file *seq, void *v) +{ + int i; + + for (i = 0; i < 2; i++) { + u32 map = readl(mbus->sdramwins_base + DOVE_DDR_BASE_CS_OFF(i)); + u64 base; + u32 size; + + if (!(map & 1)) { + seq_printf(seq, "[%d] disabled\n", i); + continue; + } + + base = map & 0xff800000; + size = 0x100000 << (((map & 0x000f0000) >> 16) - 4); + + seq_printf(seq, "[%d] %016llx - %016llx : cs%d\n", + i, (unsigned long long)base, + (unsigned long long)base + size, i); + } + + return 0; +} + +static int mvebu_sdram_debug_show(struct seq_file *seq, void *v) +{ + struct mvebu_mbus_state *mbus = &mbus_state; + return mbus->soc->show_cpu_target(mbus, seq, v); +} + +static int mvebu_sdram_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, mvebu_sdram_debug_show, inode->i_private); +} + +static const struct file_operations mvebu_sdram_debug_fops = { + .open = mvebu_sdram_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int mvebu_devs_debug_show(struct seq_file *seq, void *v) +{ + struct mvebu_mbus_state *mbus = &mbus_state; + int win; + + for (win = 0; win < mbus->soc->num_wins; win++) { + u64 wbase, wremap; + u32 wsize; + u8 wtarget, wattr; + int enabled, i; + const char *name; + + mvebu_mbus_read_window(mbus, win, + &enabled, &wbase, &wsize, + &wtarget, &wattr, &wremap); + + if (!enabled) { + seq_printf(seq, "[%02d] disabled\n", win); + continue; + } + + + for (i = 0; mbus->soc->map[i].name; i++) + if (mbus->soc->map[i].target == wtarget && + mbus->soc->map[i].attr == + (wattr & mbus->soc->map[i].attrmask)) + break; + + name = mbus->soc->map[i].name ?: "unknown"; + + seq_printf(seq, "[%02d] %016llx - %016llx : %s", + win, (unsigned long long)wbase, + (unsigned long long)(wbase + wsize), name); + + if (win < mbus->soc->num_remappable_wins) { + seq_printf(seq, " (remap %016llx)\n", + (unsigned long long)wremap); + } else + seq_printf(seq, "\n"); + } + + return 0; +} + +static int mvebu_devs_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, mvebu_devs_debug_show, inode->i_private); +} + +static const struct file_operations mvebu_devs_debug_fops = { + .open = mvebu_devs_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * SoC-specific functions and definitions + */ + +static unsigned int orion_mbus_win_offset(int win) +{ + return win << 4; +} + +static unsigned int armada_370_xp_mbus_win_offset(int win) +{ + /* The register layout is a bit annoying and the below code + * tries to cope with it. 
+ * - At offset 0x0, there are the registers for the first 8 + * windows, with 4 registers of 32 bits per window (ctrl, + * base, remap low, remap high) + * - Then at offset 0x80, there is a hole of 0x10 bytes for + * the internal registers base address and internal units + * sync barrier register. + * - Then at offset 0x90, there the registers for 12 + * windows, with only 2 registers of 32 bits per window + * (ctrl, base). + */ + if (win < 8) + return win << 4; + else + return 0x90 + ((win - 8) << 3); +} + +static unsigned int mv78xx0_mbus_win_offset(int win) +{ + if (win < 8) + return win << 4; + else + return 0x900 + ((win - 8) << 4); +} + +static void __init +mvebu_mbus_default_setup_cpu_target(struct mvebu_mbus_state *mbus) +{ + int i; + int cs; + + mvebu_mbus_dram_info.mbus_dram_target_id = TARGET_DDR; + + for (i = 0, cs = 0; i < 4; i++) { + u32 base = readl(mbus->sdramwins_base + DDR_BASE_CS_OFF(i)); + u32 size = readl(mbus->sdramwins_base + DDR_SIZE_CS_OFF(i)); + + /* + * We only take care of entries for which the chip + * select is enabled, and that don't have high base + * address bits set (devices can only access the first + * 32 bits of the memory). + */ + if ((size & DDR_SIZE_ENABLED) && + !(base & DDR_BASE_CS_HIGH_MASK)) { + struct mbus_dram_window *w; + + w = &mvebu_mbus_dram_info.cs[cs++]; + w->cs_index = i; + w->mbus_attr = 0xf & ~(1 << i); + if (mbus->hw_io_coherency) + w->mbus_attr |= ATTR_HW_COHERENCY; + w->base = base & DDR_BASE_CS_LOW_MASK; + w->size = (size | ~DDR_SIZE_MASK) + 1; + } + } + mvebu_mbus_dram_info.num_cs = cs; +} + +static void __init +mvebu_mbus_dove_setup_cpu_target(struct mvebu_mbus_state *mbus) +{ + int i; + int cs; + + mvebu_mbus_dram_info.mbus_dram_target_id = TARGET_DDR; + + for (i = 0, cs = 0; i < 2; i++) { + u32 map = readl(mbus->sdramwins_base + DOVE_DDR_BASE_CS_OFF(i)); + + /* + * Chip select enabled? 
+ */ + if (map & 1) { + struct mbus_dram_window *w; + + w = &mvebu_mbus_dram_info.cs[cs++]; + w->cs_index = i; + w->mbus_attr = 0; /* CS address decoding done inside */ + /* the DDR controller, no need to */ + /* provide attributes */ + w->base = map & 0xff800000; + w->size = 0x100000 << (((map & 0x000f0000) >> 16) - 4); + } + } + + mvebu_mbus_dram_info.num_cs = cs; +} + +static const struct mvebu_mbus_mapping armada_370_map[] = { + MAPDEF("bootrom", 1, 0xe0, MAPDEF_NOMASK), + MAPDEF("devbus-boot", 1, 0x2f, MAPDEF_NOMASK), + MAPDEF("devbus-cs0", 1, 0x3e, MAPDEF_NOMASK), + MAPDEF("devbus-cs1", 1, 0x3d, MAPDEF_NOMASK), + MAPDEF("devbus-cs2", 1, 0x3b, MAPDEF_NOMASK), + MAPDEF("devbus-cs3", 1, 0x37, MAPDEF_NOMASK), + MAPDEF("pcie0.0", 4, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie1.0", 8, 0xe0, MAPDEF_PCIMASK), + {}, +}; + +static const struct mvebu_mbus_soc_data armada_370_mbus_data = { + .num_wins = 20, + .num_remappable_wins = 8, + .win_cfg_offset = armada_370_xp_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_default_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_orion, + .map = armada_370_map, +}; + +static const struct mvebu_mbus_mapping armada_xp_map[] = { + MAPDEF("bootrom", 1, 0x1d, MAPDEF_NOMASK), + MAPDEF("devbus-boot", 1, 0x2f, MAPDEF_NOMASK), + MAPDEF("devbus-cs0", 1, 0x3e, MAPDEF_NOMASK), + MAPDEF("devbus-cs1", 1, 0x3d, MAPDEF_NOMASK), + MAPDEF("devbus-cs2", 1, 0x3b, MAPDEF_NOMASK), + MAPDEF("devbus-cs3", 1, 0x37, MAPDEF_NOMASK), + MAPDEF("pcie0.0", 4, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie0.1", 4, 0xd0, MAPDEF_PCIMASK), + MAPDEF("pcie0.2", 4, 0xb0, MAPDEF_PCIMASK), + MAPDEF("pcie0.3", 4, 0x70, MAPDEF_PCIMASK), + MAPDEF("pcie1.0", 8, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie1.1", 8, 0xd0, MAPDEF_PCIMASK), + MAPDEF("pcie1.2", 8, 0xb0, MAPDEF_PCIMASK), + MAPDEF("pcie1.3", 8, 0x70, MAPDEF_PCIMASK), + MAPDEF("pcie2.0", 4, 0xf0, MAPDEF_PCIMASK), + MAPDEF("pcie3.0", 8, 0xf0, MAPDEF_PCIMASK), + {}, +}; + +static const struct mvebu_mbus_soc_data armada_xp_mbus_data = { + .num_wins = 20, + .num_remappable_wins = 8, + .win_cfg_offset = armada_370_xp_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_default_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_orion, + .map = armada_xp_map, +}; + +static const struct mvebu_mbus_mapping kirkwood_map[] = { + MAPDEF("pcie0.0", 4, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie1.0", 8, 0xe0, MAPDEF_PCIMASK), + MAPDEF("sram", 3, 0x01, MAPDEF_NOMASK), + MAPDEF("nand", 1, 0x2f, MAPDEF_NOMASK), + {}, +}; + +static const struct mvebu_mbus_soc_data kirkwood_mbus_data = { + .num_wins = 8, + .num_remappable_wins = 4, + .win_cfg_offset = orion_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_default_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_orion, + .map = kirkwood_map, +}; + +static const struct mvebu_mbus_mapping dove_map[] = { + MAPDEF("pcie0.0", 0x4, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie1.0", 0x8, 0xe0, MAPDEF_PCIMASK), + MAPDEF("cesa", 0x3, 0x01, MAPDEF_NOMASK), + MAPDEF("bootrom", 0x1, 0xfd, MAPDEF_NOMASK), + MAPDEF("scratchpad", 0xd, 0x0, MAPDEF_NOMASK), + {}, +}; + +static const struct mvebu_mbus_soc_data dove_mbus_data = { + .num_wins = 8, + .num_remappable_wins = 4, + .win_cfg_offset = orion_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_dove_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_dove, + .map = dove_map, +}; + +static const struct mvebu_mbus_mapping orion5x_map[] = { + MAPDEF("pcie0.0", 4, 0x51, MAPDEF_ORIONPCIMASK), + MAPDEF("pci0.0", 3, 0x51, MAPDEF_ORIONPCIMASK), + MAPDEF("devbus-boot", 1, 
0x0f, MAPDEF_NOMASK), + MAPDEF("devbus-cs0", 1, 0x1e, MAPDEF_NOMASK), + MAPDEF("devbus-cs1", 1, 0x1d, MAPDEF_NOMASK), + MAPDEF("devbus-cs2", 1, 0x1b, MAPDEF_NOMASK), + MAPDEF("sram", 0, 0x00, MAPDEF_NOMASK), + {}, +}; + +/* + * Some variants of Orion5x have 4 remappable windows, some other have + * only two of them. + */ +static const struct mvebu_mbus_soc_data orion5x_4win_mbus_data = { + .num_wins = 8, + .num_remappable_wins = 4, + .win_cfg_offset = orion_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_default_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_orion, + .map = orion5x_map, +}; + +static const struct mvebu_mbus_soc_data orion5x_2win_mbus_data = { + .num_wins = 8, + .num_remappable_wins = 2, + .win_cfg_offset = orion_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_default_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_orion, + .map = orion5x_map, +}; + +static const struct mvebu_mbus_mapping mv78xx0_map[] = { + MAPDEF("pcie0.0", 4, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie0.1", 4, 0xd0, MAPDEF_PCIMASK), + MAPDEF("pcie0.2", 4, 0xb0, MAPDEF_PCIMASK), + MAPDEF("pcie0.3", 4, 0x70, MAPDEF_PCIMASK), + MAPDEF("pcie1.0", 8, 0xe0, MAPDEF_PCIMASK), + MAPDEF("pcie1.1", 8, 0xd0, MAPDEF_PCIMASK), + MAPDEF("pcie1.2", 8, 0xb0, MAPDEF_PCIMASK), + MAPDEF("pcie1.3", 8, 0x70, MAPDEF_PCIMASK), + MAPDEF("pcie2.0", 4, 0xf0, MAPDEF_PCIMASK), + MAPDEF("pcie3.0", 8, 0xf0, MAPDEF_PCIMASK), + {}, +}; + +static const struct mvebu_mbus_soc_data mv78xx0_mbus_data = { + .num_wins = 14, + .num_remappable_wins = 8, + .win_cfg_offset = mv78xx0_mbus_win_offset, + .setup_cpu_target = mvebu_mbus_default_setup_cpu_target, + .show_cpu_target = mvebu_sdram_debug_show_orion, + .map = mv78xx0_map, +}; + +/* + * The driver doesn't yet have a DT binding because the details of + * this DT binding still need to be sorted out. However, as a + * preparation, we already use of_device_id to match a SoC description + * string against the SoC specific details of this driver. 
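 + * For example, passing the string "marvell,armadaxp-mbus" to + * mvebu_mbus_init() selects the armada_xp_mbus_data entry below.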
+ */ +static const struct of_device_id of_mvebu_mbus_ids[] = { + { .compatible = "marvell,armada370-mbus", + .data = &armada_370_mbus_data, }, + { .compatible = "marvell,armadaxp-mbus", + .data = &armada_xp_mbus_data, }, + { .compatible = "marvell,kirkwood-mbus", + .data = &kirkwood_mbus_data, }, + { .compatible = "marvell,dove-mbus", + .data = &dove_mbus_data, }, + { .compatible = "marvell,orion5x-88f5281-mbus", + .data = &orion5x_4win_mbus_data, }, + { .compatible = "marvell,orion5x-88f5182-mbus", + .data = &orion5x_2win_mbus_data, }, + { .compatible = "marvell,orion5x-88f5181-mbus", + .data = &orion5x_2win_mbus_data, }, + { .compatible = "marvell,orion5x-88f6183-mbus", + .data = &orion5x_4win_mbus_data, }, + { .compatible = "marvell,mv78xx0-mbus", + .data = &mv78xx0_mbus_data, }, + { }, +}; + +/* + * Public API of the driver + */ +int mvebu_mbus_add_window_remap_flags(const char *devname, phys_addr_t base, + size_t size, phys_addr_t remap, + unsigned int flags) +{ + struct mvebu_mbus_state *s = &mbus_state; + u8 target, attr; + int i; + + if (!s->soc->map) + return -ENODEV; + + for (i = 0; s->soc->map[i].name; i++) + if (!strcmp(s->soc->map[i].name, devname)) + break; + + if (!s->soc->map[i].name) { + pr_err("mvebu-mbus: unknown device '%s'\n", devname); + return -ENODEV; + } + + target = s->soc->map[i].target; + attr = s->soc->map[i].attr; + + if (flags == MVEBU_MBUS_PCI_MEM) + attr |= 0x8; + else if (flags == MVEBU_MBUS_PCI_WA) + attr |= 0x28; + + if (!mvebu_mbus_window_conflicts(s, base, size, target, attr)) { + pr_err("mvebu-mbus: cannot add window '%s', conflicts with another window\n", + devname); + return -EINVAL; + } + + return mvebu_mbus_alloc_window(s, base, size, remap, target, attr); + +} + +int mvebu_mbus_add_window(const char *devname, phys_addr_t base, size_t size) +{ + return mvebu_mbus_add_window_remap_flags(devname, base, size, + MVEBU_MBUS_NO_REMAP, 0); +} + +int mvebu_mbus_del_window(phys_addr_t base, size_t size) +{ + int win; + + win = mvebu_mbus_find_window(&mbus_state, base, size); + if (win < 0) + return win; + + mvebu_mbus_disable_window(&mbus_state, win); + return 0; +} + +static __init int mvebu_mbus_debugfs_init(void) +{ + struct mvebu_mbus_state *s = &mbus_state; + + /* + * If no base has been initialized, doesn't make sense to + * register the debugfs entries. We may be on a multiplatform + * kernel that isn't running a Marvell EBU SoC. 
+ */ + if (!s->mbuswins_base) + return 0; + + s->debugfs_root = debugfs_create_dir("mvebu-mbus", NULL); + if (s->debugfs_root) { + s->debugfs_sdram = debugfs_create_file("sdram", S_IRUGO, + s->debugfs_root, NULL, + &mvebu_sdram_debug_fops); + s->debugfs_devs = debugfs_create_file("devices", S_IRUGO, + s->debugfs_root, NULL, + &mvebu_devs_debug_fops); + } + + return 0; +} +fs_initcall(mvebu_mbus_debugfs_init); + +int __init mvebu_mbus_init(const char *soc, phys_addr_t mbuswins_phys_base, + size_t mbuswins_size, + phys_addr_t sdramwins_phys_base, + size_t sdramwins_size) +{ + struct mvebu_mbus_state *mbus = &mbus_state; + const struct of_device_id *of_id; + int win; + + for (of_id = of_mvebu_mbus_ids; of_id->compatible; of_id++) + if (!strcmp(of_id->compatible, soc)) + break; + + if (!of_id->compatible) { + pr_err("mvebu-mbus: could not find a matching SoC family\n"); + return -ENODEV; + } + + mbus->soc = of_id->data; + + mbus->mbuswins_base = ioremap(mbuswins_phys_base, mbuswins_size); + if (!mbus->mbuswins_base) + return -ENOMEM; + + mbus->sdramwins_base = ioremap(sdramwins_phys_base, sdramwins_size); + if (!mbus->sdramwins_base) { + iounmap(mbus_state.mbuswins_base); + return -ENOMEM; + } + + for (win = 0; win < mbus->soc->num_wins; win++) + mvebu_mbus_disable_window(mbus, win); + + mbus->soc->setup_cpu_target(mbus); + + return 0; +} diff --git a/include/linux/mbus.h b/include/linux/mbus.h index efa1a6d7aca8..462eb9791012 100644 --- a/include/linux/mbus.h +++ b/include/linux/mbus.h @@ -32,6 +32,17 @@ struct mbus_dram_target_info } cs[4]; }; +/* Flags for PCI/PCIe address decoding regions */ +#define MVEBU_MBUS_PCI_IO 0x1 +#define MVEBU_MBUS_PCI_MEM 0x2 +#define MVEBU_MBUS_PCI_WA 0x3 + +/* + * Magic value indicating that we don't need a remapping-capable + * address decoding window. + */ +#define MVEBU_MBUS_NO_REMAP (0xffffffff) + /* * The Marvell mbus is to be found only on SOCs from the Orion family * at the moment. Provide a dummy stub for other architectures. @@ -44,4 +55,15 @@ static inline const struct mbus_dram_target_info *mv_mbus_dram_info(void) return NULL; } #endif -#endif + +int mvebu_mbus_add_window_remap_flags(const char *devname, phys_addr_t base, + size_t size, phys_addr_t remap, + unsigned int flags); +int mvebu_mbus_add_window(const char *devname, phys_addr_t base, + size_t size); +int mvebu_mbus_del_window(phys_addr_t base, size_t size); +int mvebu_mbus_init(const char *soc, phys_addr_t mbus_phys_base, + size_t mbus_size, phys_addr_t sdram_phys_base, + size_t sdram_size); + +#endif /* __LINUX_MBUS_H */ -- cgit From e5354107e14755991da82e0d2a4791db92908d9d Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 27 Mar 2013 17:29:53 +0200 Subject: mei: bus: Initial MEI Client bus type implementation The MEI client bus will present some of the MEI clients as devices for other standard subsystems. Implement the probe, remove, match and device addition routines, along with the sysfs and uevent ones. mei_cl_device_id is also added to mod_devicetable.h. A mei-client-bus.txt document describing the rationale and the API usage is also added, while ABI/testing/sysfs-bus-mei describes the modalias ABI.
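As an illustration (a sketch only, not part of this patch: the helper and the "nfc" device name are hypothetical placeholders), the MEI core side would be expected to expose a client on the new bus roughly like this:

/* Hypothetical core-side registration of an ME client device */
static int mei_expose_client(struct mei_device *dev, uuid_le uuid)
{
	static char name[] = "nfc";	/* placeholder device name */
	struct mei_cl_device *cldev;

	cldev = mei_cl_add_device(dev, uuid, name);
	if (!cldev)	/* mei_cl_add_device() returns NULL on failure */
		return -ENOMEM;

	return 0;
}

Drivers then match against that device name through their id_table, as described in the documentation added below.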
Signed-off-by: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- Documentation/ABI/testing/sysfs-bus-mei | 7 + Documentation/misc-devices/mei/mei-client-bus.txt | 135 +++++++++++++++++ drivers/misc/mei/Makefile | 1 + drivers/misc/mei/bus.c | 172 ++++++++++++++++++++++ drivers/misc/mei/mei_dev.h | 26 ++++ include/linux/mei_cl_bus.h | 20 +++ include/linux/mod_devicetable.h | 9 ++ scripts/mod/devicetable-offsets.c | 3 + scripts/mod/file2alias.c | 12 ++ 9 files changed, 385 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-bus-mei create mode 100644 Documentation/misc-devices/mei/mei-client-bus.txt create mode 100644 drivers/misc/mei/bus.c create mode 100644 include/linux/mei_cl_bus.h (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-mei b/Documentation/ABI/testing/sysfs-bus-mei new file mode 100644 index 000000000000..2066f0bbd453 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-mei @@ -0,0 +1,7 @@ +What: /sys/bus/mei/devices/.../modalias +Date: March 2013 +KernelVersion: 3.10 +Contact: Samuel Ortiz + linux-mei@linux.intel.com +Description: Stores the same MODALIAS value emitted by uevent + Format: mei: diff --git a/Documentation/misc-devices/mei/mei-client-bus.txt b/Documentation/misc-devices/mei/mei-client-bus.txt new file mode 100644 index 000000000000..9dc5ebf94eb1 --- /dev/null +++ b/Documentation/misc-devices/mei/mei-client-bus.txt @@ -0,0 +1,135 @@ +Intel(R) Management Engine (ME) Client bus API +=============================================== + + +Rationale +========= +The MEI misc character device is useful for dedicated applications to send and receive +data to the many FW appliances found in Intel's ME from user space. +However, for some of the ME functionalities it makes sense to leverage the existing software +stack and expose them through existing kernel subsystems. + +In order to plug seamlessly into the kernel device driver model we add a kernel virtual +bus abstraction on top of the MEI driver. This allows implementing Linux kernel drivers +for the various MEI features as stand-alone entities found in their respective subsystems. +Existing device drivers can even potentially be re-used by adding an MEI CL bus layer to +the existing code. + + +MEI CL bus API +============== +A driver implementation for an MEI Client is very similar to existing bus +based device drivers. The driver registers itself as an MEI CL bus driver through +the mei_cl_driver structure: + +struct mei_cl_driver { + struct device_driver driver; + const char *name; + + const struct mei_cl_device_id *id_table; + + int (*probe)(struct mei_cl_device *dev, const struct mei_cl_device_id *id); + int (*remove)(struct mei_cl_device *dev); +}; + +struct mei_cl_device_id { + char name[MEI_CL_NAME_SIZE]; + kernel_ulong_t driver_info; +}; + +The mei_cl_device_id structure allows the driver to bind itself against a device name. + +To actually register a driver on the ME Client bus one must call the mei_cl_driver_register() +API. This is typically called at module init time. + +Once registered on the ME Client bus, a driver will typically try to do some I/O on +this bus and this should be done through the mei_cl_send() and mei_cl_recv() +routines. The latter is synchronous (blocks and sleeps until data shows up). +In order for drivers to be notified of pending events waiting for them (e.g. +an Rx event) they can register an event handler through the +mei_cl_register_event_cb() routine.
Currently only the MEI_CL_EVENT_RX event +will trigger an event handler call and the driver implementation is supposed +to call mei_cl_recv() from the event handler in order to fetch the pending +received buffers. + + +Example ======= +As a theoretical example let's pretend the ME comes with a "contact" NFC IP. +The driver init and exit routines for this device would look like: + +#define CONTACT_DRIVER_NAME "contact" + +static struct mei_cl_device_id contact_mei_cl_tbl[] = { + { CONTACT_DRIVER_NAME, }, + + /* required last entry */ + { } +}; +MODULE_DEVICE_TABLE(mei_cl, contact_mei_cl_tbl); + +static struct mei_cl_driver contact_driver = { + .id_table = contact_mei_cl_tbl, + .name = CONTACT_DRIVER_NAME, + + .probe = contact_probe, + .remove = contact_remove, +}; + +static int __init contact_init(void) +{ + int r; + + r = mei_cl_driver_register(&contact_driver); + if (r) { + pr_err(CONTACT_DRIVER_NAME ": driver registration failed\n"); + return r; + } + + return 0; +} + +static void __exit contact_exit(void) +{ + mei_cl_driver_unregister(&contact_driver); +} + +module_init(contact_init); +module_exit(contact_exit); + +And the driver's simplified probe routine would look like this: + +int contact_probe(struct mei_cl_device *dev, const struct mei_cl_device_id *id) +{ + struct contact_driver *contact; + + [...] + mei_cl_register_event_cb(dev, contact_event_cb, contact); + + return 0; + } + +In the probe routine the driver basically registers an ME bus event handler +which is as close as it can get to registering a threaded IRQ handler. +The handler implementation will typically call some I/O routine depending on +the pending events: + +#define MAX_NFC_PAYLOAD 128 + +static void contact_event_cb(struct mei_cl_device *dev, u32 events, + void *context) +{ + struct contact_driver *contact = context; + + if (events & BIT(MEI_CL_EVENT_RX)) { + u8 payload[MAX_NFC_PAYLOAD]; + int payload_size; + + payload_size = mei_cl_recv(dev, payload, MAX_NFC_PAYLOAD); + if (payload_size <= 0) + return; + + /* Hook to the NFC subsystem */ + nfc_hci_recv_frame(contact->hdev, payload, payload_size); + } +} diff --git a/drivers/misc/mei/Makefile b/drivers/misc/mei/Makefile index 2c336d087749..1b29f7ccac49 100644 --- a/drivers/misc/mei/Makefile +++ b/drivers/misc/mei/Makefile @@ -10,6 +10,7 @@ mei-objs += client.o mei-objs += main.o mei-objs += amthif.o mei-objs += wd.o +mei-objs += bus.o obj-$(CONFIG_INTEL_MEI_ME) += mei-me.o mei-me-objs := pci-me.o diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c new file mode 100644 index 000000000000..78c876af2676 --- /dev/null +++ b/drivers/misc/mei/bus.c @@ -0,0 +1,172 @@ +/* + * Intel Management Engine Interface (Intel MEI) Linux driver + * Copyright (c) 2012-2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details.
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mei_dev.h" + +#define to_mei_cl_driver(d) container_of(d, struct mei_cl_driver, driver) +#define to_mei_cl_device(d) container_of(d, struct mei_cl_device, dev) + +static int mei_cl_device_match(struct device *dev, struct device_driver *drv) +{ + struct mei_cl_device *device = to_mei_cl_device(dev); + struct mei_cl_driver *driver = to_mei_cl_driver(drv); + const struct mei_cl_device_id *id; + + if (!device) + return 0; + + if (!driver || !driver->id_table) + return 0; + + id = driver->id_table; + + while (id->name[0]) { + if (!strcmp(dev_name(dev), id->name)) + return 1; + + id++; + } + + return 0; +} + +static int mei_cl_device_probe(struct device *dev) +{ + struct mei_cl_device *device = to_mei_cl_device(dev); + struct mei_cl_driver *driver; + struct mei_cl_device_id id; + + if (!device) + return 0; + + driver = to_mei_cl_driver(dev->driver); + if (!driver || !driver->probe) + return -ENODEV; + + dev_dbg(dev, "Device probe\n"); + + strncpy(id.name, dev_name(dev), MEI_CL_NAME_SIZE); + + return driver->probe(device, &id); +} + +static int mei_cl_device_remove(struct device *dev) +{ + struct mei_cl_device *device = to_mei_cl_device(dev); + struct mei_cl_driver *driver; + + if (!device || !dev->driver) + return 0; + + driver = to_mei_cl_driver(dev->driver); + if (!driver->remove) { + dev->driver = NULL; + + return 0; + } + + return driver->remove(device); +} + +static ssize_t modalias_show(struct device *dev, struct device_attribute *a, + char *buf) +{ + int len; + + len = snprintf(buf, PAGE_SIZE, "mei:%s\n", dev_name(dev)); + + return (len >= PAGE_SIZE) ? (PAGE_SIZE - 1) : len; +} + +static struct device_attribute mei_cl_dev_attrs[] = { + __ATTR_RO(modalias), + __ATTR_NULL, +}; + +static int mei_cl_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + if (add_uevent_var(env, "MODALIAS=mei:%s", dev_name(dev))) + return -ENOMEM; + + return 0; +} + +static struct bus_type mei_cl_bus_type = { + .name = "mei", + .dev_attrs = mei_cl_dev_attrs, + .match = mei_cl_device_match, + .probe = mei_cl_device_probe, + .remove = mei_cl_device_remove, + .uevent = mei_cl_uevent, +}; + +static void mei_cl_dev_release(struct device *dev) +{ + kfree(to_mei_cl_device(dev)); +} + +static struct device_type mei_cl_device_type = { + .release = mei_cl_dev_release, +}; + +struct mei_cl_device *mei_cl_add_device(struct mei_device *mei_device, + uuid_le uuid, char *name) +{ + struct mei_cl_device *device; + int status; + + device = kzalloc(sizeof(struct mei_cl_device), GFP_KERNEL); + if (!device) + return NULL; + + device->dev.parent = &mei_device->pdev->dev; + device->dev.bus = &mei_cl_bus_type; + device->dev.type = &mei_cl_device_type; + + dev_set_name(&device->dev, "%s", name); + + status = device_register(&device->dev); + if (status) + goto out_err; + + dev_dbg(&device->dev, "client %s registered\n", name); + + return device; + +out_err: + dev_err(device->dev.parent, "Failed to register MEI client\n"); + + kfree(device); + + return NULL; +} +EXPORT_SYMBOL_GPL(mei_cl_add_device); + +void mei_cl_remove_device(struct mei_cl_device *device) +{ + device_unregister(&device->dev); +} +EXPORT_SYMBOL_GPL(mei_cl_remove_device); diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index b5d66076de3d..7abb705ddf3f 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "hw.h" #include "hw-me-regs.h" @@ 
-262,6 +263,31 @@ struct mei_hw_ops { unsigned char *buf, unsigned long len); }; +/* MEI bus API*/ +struct mei_cl_device *mei_cl_add_device(struct mei_device *dev, + uuid_le uuid, char *name); +void mei_cl_remove_device(struct mei_cl_device *device); + +/** + * struct mei_cl_device - MEI device handle + * An mei_cl_device pointer is returned from mei_add_device() + * and links MEI bus clients to their actual ME host client pointer. + * Drivers for MEI devices will get an mei_cl_device pointer + * when being probed and shall use it for doing ME bus I/O. + * + * @dev: linux driver model device pointer + * @uuid: me client uuid + * @cl: mei client + * @priv_data: client private data + */ +struct mei_cl_device { + struct device dev; + + struct mei_cl *cl; + + void *priv_data; +}; + /** * struct mei_device - MEI private device struct diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h new file mode 100644 index 000000000000..4e7351de7eca --- /dev/null +++ b/include/linux/mei_cl_bus.h @@ -0,0 +1,20 @@ +#ifndef _LINUX_MEI_CL_BUS_H +#define _LINUX_MEI_CL_BUS_H + +#include +#include + +struct mei_cl_device; + +struct mei_cl_driver { + struct device_driver driver; + const char *name; + + const struct mei_cl_device_id *id_table; + + int (*probe)(struct mei_cl_device *dev, + const struct mei_cl_device_id *id); + int (*remove)(struct mei_cl_device *dev); +}; + +#endif /* _LINUX_MEI_CL_BUS_H */ diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 779cf7c4a3d1..b508016fb76d 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -9,6 +9,7 @@ #ifdef __KERNEL__ #include +#include typedef unsigned long kernel_ulong_t; #endif @@ -568,4 +569,12 @@ struct ipack_device_id { __u32 device; /* Device ID or IPACK_ANY_ID */ }; +#define MEI_CL_MODULE_PREFIX "mei:" +#define MEI_CL_NAME_SIZE 32 + +struct mei_cl_device_id { + char name[MEI_CL_NAME_SIZE]; + kernel_ulong_t driver_info; +}; + #endif /* LINUX_MOD_DEVICETABLE_H */ diff --git a/scripts/mod/devicetable-offsets.c b/scripts/mod/devicetable-offsets.c index b45260bfeaa0..e66d4d258e1a 100644 --- a/scripts/mod/devicetable-offsets.c +++ b/scripts/mod/devicetable-offsets.c @@ -174,5 +174,8 @@ int main(void) DEVID_FIELD(x86_cpu_id, model); DEVID_FIELD(x86_cpu_id, vendor); + DEVID(mei_cl_device_id); + DEVID_FIELD(mei_cl_device_id, name); + return 0; } diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 771ac17f635d..45f9a3377dcd 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -1133,6 +1133,18 @@ static int do_x86cpu_entry(const char *filename, void *symval, } ADD_TO_DEVTABLE("x86cpu", x86_cpu_id, do_x86cpu_entry); +/* Looks like: mei:S */ +static int do_mei_entry(const char *filename, void *symval, + char *alias) +{ + DEF_FIELD_ADDR(symval, mei_cl_device_id, name); + + sprintf(alias, MEI_CL_MODULE_PREFIX "%s", *name); + + return 1; +} +ADD_TO_DEVTABLE("mei", mei_cl_device_id, do_mei_entry); + /* Does namelen bytes of name exactly match the symbol? 
*/ static bool sym_is(const char *name, unsigned namelen, const char *symbol) { -- cgit From 333e4ee0781bd0b5938da263c4bb7ab66a0d1b57 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 27 Mar 2013 17:29:54 +0200 Subject: mei: bus: Implement driver registration Signed-off-by: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus.c | 26 ++++++++++++++++++++++++++ include/linux/mei_cl_bus.h | 7 +++++++ 2 files changed, 33 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 78c876af2676..d16b3c3e1b38 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -170,3 +170,29 @@ void mei_cl_remove_device(struct mei_cl_device *device) device_unregister(&device->dev); } EXPORT_SYMBOL_GPL(mei_cl_remove_device); + +int __mei_cl_driver_register(struct mei_cl_driver *driver, struct module *owner) +{ + int err; + + driver->driver.name = driver->name; + driver->driver.owner = owner; + driver->driver.bus = &mei_cl_bus_type; + + err = driver_register(&driver->driver); + if (err) + return err; + + pr_debug("mei: driver [%s] registered\n", driver->driver.name); + + return 0; +} +EXPORT_SYMBOL_GPL(__mei_cl_driver_register); + +void mei_cl_driver_unregister(struct mei_cl_driver *driver) +{ + driver_unregister(&driver->driver); + + pr_debug("mei: driver [%s] unregistered\n", driver->driver.name); +} +EXPORT_SYMBOL_GPL(mei_cl_driver_unregister); diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index 4e7351de7eca..ba2aa3b66f30 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -17,4 +17,11 @@ struct mei_cl_driver { int (*remove)(struct mei_cl_device *dev); }; +int __mei_cl_driver_register(struct mei_cl_driver *driver, + struct module *owner); +#define mei_cl_driver_register(driver) \ + __mei_cl_driver_register(driver, THIS_MODULE) + +void mei_cl_driver_unregister(struct mei_cl_driver *driver); + #endif /* _LINUX_MEI_CL_BUS_H */ -- cgit From 3e8332952dedd2c17bb497e3909e3b6fbac10ce7 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 27 Mar 2013 17:29:55 +0200 Subject: mei: bus: Initial implementation for I/O routines Signed-off-by: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus.c | 226 +++++++++++++++++++++++++++++++++++++++++++++ drivers/misc/mei/mei_dev.h | 30 ++++++ include/linux/mei_cl_bus.h | 11 +++ 3 files changed, 267 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index d16b3c3e1b38..16c7fff50549 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +26,8 @@ #include #include "mei_dev.h" +#include "hw-me.h" +#include "client.h" #define to_mei_cl_driver(d) container_of(d, struct mei_cl_driver, driver) #define to_mei_cl_device(d) container_of(d, struct mei_cl_device, dev) @@ -81,6 +84,11 @@ static int mei_cl_device_remove(struct device *dev) if (!device || !dev->driver) return 0; + if (device->event_cb) { + device->event_cb = NULL; + cancel_work_sync(&device->event_work); + } + driver = to_mei_cl_driver(dev->driver); if (!driver->remove) { dev->driver = NULL; @@ -196,3 +204,221 @@ void mei_cl_driver_unregister(struct mei_cl_driver *driver) pr_debug("mei: driver [%s] unregistered\n", driver->driver.name); } EXPORT_SYMBOL_GPL(mei_cl_driver_unregister); + +int __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length) +{ + struct mei_device 
*dev; + struct mei_msg_hdr mei_hdr; + struct mei_cl_cb *cb; + int me_cl_id, err; + + if (WARN_ON(!cl || !cl->dev)) + return -ENODEV; + + if (cl->state != MEI_FILE_CONNECTED) + return -ENODEV; + + cb = mei_io_cb_init(cl, NULL); + if (!cb) + return -ENOMEM; + + err = mei_io_cb_alloc_req_buf(cb, length); + if (err < 0) { + mei_io_cb_free(cb); + return err; + } + + memcpy(cb->request_buffer.data, buf, length); + cb->fop_type = MEI_FOP_WRITE; + + dev = cl->dev; + + mutex_lock(&dev->device_lock); + + /* Check if we have an ME client device */ + me_cl_id = mei_me_cl_by_id(dev, cl->me_client_id); + if (me_cl_id == dev->me_clients_num) { + err = -ENODEV; + goto out_err; + } + + if (length > dev->me_clients[me_cl_id].props.max_msg_length) { + err = -EINVAL; + goto out_err; + } + + err = mei_cl_flow_ctrl_creds(cl); + if (err < 0) + goto out_err; + + /* Host buffer is not ready, we queue the request */ + if (err == 0 || !dev->hbuf_is_ready) { + cb->buf_idx = 0; + mei_hdr.msg_complete = 0; + cl->writing_state = MEI_WRITING; + list_add_tail(&cb->list, &dev->write_list.list); + + mutex_unlock(&dev->device_lock); + + return length; + } + + dev->hbuf_is_ready = false; + + /* Check for a maximum length */ + if (length > mei_hbuf_max_len(dev)) { + mei_hdr.length = mei_hbuf_max_len(dev); + mei_hdr.msg_complete = 0; + } else { + mei_hdr.length = length; + mei_hdr.msg_complete = 1; + } + + mei_hdr.host_addr = cl->host_client_id; + mei_hdr.me_addr = cl->me_client_id; + mei_hdr.reserved = 0; + + if (mei_write_message(dev, &mei_hdr, buf)) { + err = -EIO; + goto out_err; + } + + cl->writing_state = MEI_WRITING; + cb->buf_idx = mei_hdr.length; + + if (!mei_hdr.msg_complete) { + list_add_tail(&cb->list, &dev->write_list.list); + } else { + if (mei_cl_flow_ctrl_reduce(cl)) { + err = -EIO; + goto out_err; + } + + list_add_tail(&cb->list, &dev->write_waiting_list.list); + } + + mutex_unlock(&dev->device_lock); + + return mei_hdr.length; + +out_err: + mutex_unlock(&dev->device_lock); + mei_io_cb_free(cb); + + return err; +} + +int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length) +{ + struct mei_device *dev; + struct mei_cl_cb *cb; + size_t r_length; + int err; + + if (WARN_ON(!cl || !cl->dev)) + return -ENODEV; + + dev = cl->dev; + + mutex_lock(&dev->device_lock); + + if (!cl->read_cb) { + err = mei_cl_read_start(cl); + if (err < 0) { + mutex_unlock(&dev->device_lock); + return err; + } + } + + if (cl->reading_state != MEI_READ_COMPLETE && + !waitqueue_active(&cl->rx_wait)) { + mutex_unlock(&dev->device_lock); + + if (wait_event_interruptible(cl->rx_wait, + (MEI_READ_COMPLETE == cl->reading_state))) { + if (signal_pending(current)) + return -EINTR; + return -ERESTARTSYS; + } + + mutex_lock(&dev->device_lock); + } + + cb = cl->read_cb; + + if (cl->reading_state != MEI_READ_COMPLETE) { + r_length = 0; + goto out; + } + + r_length = min_t(size_t, length, cb->buf_idx); + + memcpy(buf, cb->response_buffer.data, r_length); + + mei_io_cb_free(cb); + cl->reading_state = MEI_IDLE; + cl->read_cb = NULL; + +out: + mutex_unlock(&dev->device_lock); + + return r_length; +} + +int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length) +{ + struct mei_cl *cl = NULL; + + /* TODO: hook between mei_bus_client and mei_cl */ + + if (device->ops && device->ops->send) + return device->ops->send(device, buf, length); + + return __mei_cl_send(cl, buf, length); +} +EXPORT_SYMBOL_GPL(mei_cl_send); + +int mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length) +{ + struct mei_cl *cl = NULL; + + /* TODO: hook between 
mei_bus_client and mei_cl */ + + if (device->ops && device->ops->recv) + return device->ops->recv(device, buf, length); + + return __mei_cl_recv(cl, buf, length); +} +EXPORT_SYMBOL_GPL(mei_cl_recv); + +static void mei_bus_event_work(struct work_struct *work) +{ + struct mei_cl_device *device; + + device = container_of(work, struct mei_cl_device, event_work); + + if (device->event_cb) + device->event_cb(device, device->events, device->event_context); + + device->events = 0; + + /* Prepare for the next read */ + mei_cl_read_start(device->cl); +} + +int mei_cl_register_event_cb(struct mei_cl_device *device, + mei_cl_event_cb_t event_cb, void *context) +{ + if (device->event_cb) + return -EALREADY; + + device->events = 0; + device->event_cb = event_cb; + device->event_context = context; + INIT_WORK(&device->event_work, mei_bus_event_work); + + mei_cl_read_start(device->cl); + + return 0; +} +EXPORT_SYMBOL_GPL(mei_cl_register_event_cb); diff --git a/drivers/misc/mei/mei_dev.h b/drivers/misc/mei/mei_dev.h index 7abb705ddf3f..cde5687039f3 100644 --- a/drivers/misc/mei/mei_dev.h +++ b/drivers/misc/mei/mei_dev.h @@ -268,6 +268,25 @@ struct mei_cl_device *mei_cl_add_device(struct mei_device *dev, uuid_le uuid, char *name); void mei_cl_remove_device(struct mei_cl_device *device); +int __mei_cl_send(struct mei_cl *cl, u8 *buf, size_t length); +int __mei_cl_recv(struct mei_cl *cl, u8 *buf, size_t length); + +/** + * struct mei_cl_transport_ops - MEI CL device transport ops + * This structure allows ME host clients to implement technology + * specific transport layers. + * + * @send: Tx hook for the device. This allows ME host clients to trap + * the device driver buffers before actually physically + * pushing it to the ME. + * @recv: Rx hook for the device. This allows ME host clients to trap the + * ME buffers before forwarding them to the device driver. + */ +struct mei_cl_transport_ops { + int (*send)(struct mei_cl_device *device, u8 *buf, size_t length); + int (*recv)(struct mei_cl_device *device, u8 *buf, size_t length); +}; + /** * struct mei_cl_device - MEI device handle * An mei_cl_device pointer is returned from mei_add_device() @@ -278,6 +297,10 @@ void mei_cl_remove_device(struct mei_cl_device *device); * @dev: linux driver model device pointer * @uuid: me client uuid * @cl: mei client + * @ops: ME transport ops + * @event_cb: Drivers register this callback to get asynchronous ME + * events (e.g. Rx buffer pending) notifications. + * @events: Events bitmask sent to the driver. 
* @priv_data: client private data */ struct mei_cl_device { @@ -285,6 +308,13 @@ struct mei_cl_device { struct mei_cl *cl; + const struct mei_cl_transport_ops *ops; + + struct work_struct event_work; + mei_cl_event_cb_t event_cb; + void *event_context; + unsigned long events; + void *priv_data; }; diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index ba2aa3b66f30..d9958c3960a2 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -24,4 +24,15 @@ int __mei_cl_driver_register(struct mei_cl_driver *driver, void mei_cl_driver_unregister(struct mei_cl_driver *driver); +int mei_cl_send(struct mei_cl_device *device, u8 *buf, size_t length); +int mei_cl_recv(struct mei_cl_device *device, u8 *buf, size_t length); + +typedef void (*mei_cl_event_cb_t)(struct mei_cl_device *device, + u32 events, void *context); +int mei_cl_register_event_cb(struct mei_cl_device *device, + mei_cl_event_cb_t read_cb, void *context); + +#define MEI_CL_EVENT_RX 0 +#define MEI_CL_EVENT_TX 1 + #endif /* _LINUX_MEI_CL_BUS_H */ -- cgit From aa6aef216f8aea1a00b56aafc29b8745237a9b62 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 27 Mar 2013 17:29:59 +0200 Subject: mei: bus: Implement bus driver data setter/getter MEI drivers should be able to carry their private data around. Signed-off-by: Samuel Ortiz Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/bus.c | 12 ++++++++++++ include/linux/mei_cl_bus.h | 3 +++ 2 files changed, 15 insertions(+) (limited to 'include/linux') diff --git a/drivers/misc/mei/bus.c b/drivers/misc/mei/bus.c index 2b4b5b3f639f..8dbcb1516dc6 100644 --- a/drivers/misc/mei/bus.c +++ b/drivers/misc/mei/bus.c @@ -461,6 +461,18 @@ int mei_cl_register_event_cb(struct mei_cl_device *device, } EXPORT_SYMBOL_GPL(mei_cl_register_event_cb); +void *mei_cl_get_drvdata(const struct mei_cl_device *device) +{ + return dev_get_drvdata(&device->dev); +} +EXPORT_SYMBOL_GPL(mei_cl_get_drvdata); + +void mei_cl_set_drvdata(struct mei_cl_device *device, void *data) +{ + dev_set_drvdata(&device->dev, data); +} +EXPORT_SYMBOL_GPL(mei_cl_set_drvdata); + void mei_cl_bus_rx_event(struct mei_cl *cl) { struct mei_cl_device *device = cl->device; diff --git a/include/linux/mei_cl_bus.h b/include/linux/mei_cl_bus.h index d9958c3960a2..1bece18825ba 100644 --- a/include/linux/mei_cl_bus.h +++ b/include/linux/mei_cl_bus.h @@ -35,4 +35,7 @@ int mei_cl_register_event_cb(struct mei_cl_device *device, #define MEI_CL_EVENT_RX 0 #define MEI_CL_EVENT_TX 1 +void *mei_cl_get_drvdata(const struct mei_cl_device *device); +void mei_cl_set_drvdata(struct mei_cl_device *device, void *data); + #endif /* _LINUX_MEI_CL_BUS_H */ -- cgit From 6ae07f27ab202069bd567967a0099070eb7f77d5 Mon Sep 17 00:00:00 2001 From: Fabio Porcedda Date: Tue, 26 Mar 2013 10:35:17 +0100 Subject: driver core: platform_device.h: fix checkpatch errors and warnings Signed-off-by: Fabio Porcedda Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_device.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_device.h b/include/linux/platform_device.h index c082c71f7225..9abf1db6aea6 100644 --- a/include/linux/platform_device.h +++ b/include/linux/platform_device.h @@ -20,12 +20,12 @@ struct mfd_cell; struct platform_device { - const char * name; + const char *name; int id; bool id_auto; struct device dev; u32 num_resources; - struct resource * resource; + struct resource *resource; const struct 
platform_device_id *id_entry; @@ -47,9 +47,12 @@ extern struct bus_type platform_bus_type; extern struct device platform_bus; extern void arch_setup_pdev_archdata(struct platform_device *); -extern struct resource *platform_get_resource(struct platform_device *, unsigned int, unsigned int); +extern struct resource *platform_get_resource(struct platform_device *, + unsigned int, unsigned int); extern int platform_get_irq(struct platform_device *, unsigned int); -extern struct resource *platform_get_resource_byname(struct platform_device *, unsigned int, const char *); +extern struct resource *platform_get_resource_byname(struct platform_device *, + unsigned int, + const char *); extern int platform_get_irq_byname(struct platform_device *, const char *); extern int platform_add_devices(struct platform_device **, int); @@ -161,7 +164,8 @@ extern struct platform_device *platform_device_alloc(const char *name, int id); extern int platform_device_add_resources(struct platform_device *pdev, const struct resource *res, unsigned int num); -extern int platform_device_add_data(struct platform_device *pdev, const void *data, size_t size); +extern int platform_device_add_data(struct platform_device *pdev, + const void *data, size_t size); extern int platform_device_add(struct platform_device *pdev); extern void platform_device_del(struct platform_device *pdev); extern void platform_device_put(struct platform_device *pdev); @@ -190,7 +194,8 @@ static inline void *platform_get_drvdata(const struct platform_device *pdev) return dev_get_drvdata(&pdev->dev); } -static inline void platform_set_drvdata(struct platform_device *pdev, void *data) +static inline void platform_set_drvdata(struct platform_device *pdev, + void *data) { dev_set_drvdata(&pdev->dev, data); } @@ -222,10 +227,10 @@ static void __exit __platform_driver##_exit(void) \ } \ module_exit(__platform_driver##_exit); -extern struct platform_device *platform_create_bundle(struct platform_driver *driver, - int (*probe)(struct platform_device *), - struct resource *res, unsigned int n_res, - const void *data, size_t size); +extern struct platform_device *platform_create_bundle( + struct platform_driver *driver, int (*probe)(struct platform_device *), + struct resource *res, unsigned int n_res, + const void *data, size_t size); /* early platform driver interface */ struct early_platform_driver { -- cgit From cb06ff102e2d79a82cf780aa5e6947b2e0529ac0 Mon Sep 17 00:00:00 2001 From: Chanho Min Date: Wed, 27 Mar 2013 18:38:11 +0900 Subject: ARM: PL011: Add support for Rx DMA buffer polling. In DMA support, the received data is not pushed to the tty until the DMA buffer is filled. But some high-rate chips such as BT expect a fast response and data should be pushed immediately. In order to fix this issue, we suggest using a timer to poll the DMA buffer. In our tests, no data loss occurred at high baud rates compared with interrupt-driven operation (we tested at 3 Mbps). We change: - We add a timer for polling. If we set poll_timer to 10, then every 10 ms the timer handler checks the residue in the DMA buffer and transfers data to the tty. Also, last_residue is updated for the next polling. - poll_timeout is used to bound the timer's system cost. If poll_timeout is set to 3000 and no data is received for 3 seconds, we deactivate the poll timer and the driver falls back to interrupt-driven mode. When data is received again in the FIFO and a UART irq occurs, we switch back to DMA mode and start polling.
- We use consistent DMA mappings by default, to avoid the frequent cache operations of the timer function. - pl011_dma_rx_chars is modified. The pending size is recalculated because data can already have been taken by polling. - The polling time is adjusted if DMA rx poll is enabled but no rate is specified. The ideal polling interval to push one character per interval is the reciprocal of 'baud rate / 10 line bits per character / 1000 ms per sec'. But that is very aggressive for the system. Experimentally, '10000000 / baud' is suitable to receive dozens of characters. The poll rate can also be specified statically via dma_rx_poll_rate in the platform data. Changes compared to v1: - Use of consistent DMA mappings. - Added dma_rx_poll_rate in platform data to specify the polling interval. - Added dma_rx_poll_timeout in platform data to specify the polling timeout. Changes compared to v2: - Use of consistent DMA mappings by default. - Added dma_rx_poll_enable in platform data to adjust the polling time according to the baud rate. - Removed an unnecessary lock from the polling function. Signed-off-by: Chanho Min Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/amba-pl011.c | 157 +++++++++++++++++++++++++++++++++++----- include/linux/amba/serial.h | 3 + 2 files changed, 141 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 3ea5408fcbeb..b031abf43a7a 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -29,6 +29,7 @@ * and hooked into this driver. */ + #if defined(CONFIG_SERIAL_AMBA_PL011_CONSOLE) && defined(CONFIG_MAGIC_SYSRQ) #define SUPPORT_SYSRQ #endif @@ -117,6 +118,12 @@ struct pl011_dmarx_data { struct pl011_sgbuf sgbuf_b; dma_cookie_t cookie; bool running; + struct timer_list timer; + unsigned int last_residue; + unsigned long last_jiffies; + bool auto_poll_rate; + unsigned int poll_rate; + unsigned int poll_timeout; }; struct pl011_dmatx_data { @@ -223,16 +230,18 @@ static int pl011_fifo_to_tty(struct uart_amba_port *uap) static int pl011_sgbuf_init(struct dma_chan *chan, struct pl011_sgbuf *sg, enum dma_data_direction dir) { - sg->buf = kmalloc(PL011_DMA_BUFFER_SIZE, GFP_KERNEL); + dma_addr_t dma_addr; + + sg->buf = dma_alloc_coherent(chan->device->dev, + PL011_DMA_BUFFER_SIZE, &dma_addr, GFP_KERNEL); if (!sg->buf) return -ENOMEM; - sg_init_one(&sg->sg, sg->buf, PL011_DMA_BUFFER_SIZE); + sg_init_table(&sg->sg, 1); + sg_set_page(&sg->sg, phys_to_page(dma_addr), + PL011_DMA_BUFFER_SIZE, offset_in_page(dma_addr)); + sg_dma_address(&sg->sg) = dma_addr; - if (dma_map_sg(chan->device->dev, &sg->sg, 1, dir) != 1) { - kfree(sg->buf); - return -EINVAL; - } return 0; } @@ -240,8 +249,9 @@ static void pl011_sgbuf_free(struct dma_chan *chan, struct pl011_sgbuf *sg, enum dma_data_direction dir) { if (sg->buf) { - dma_unmap_sg(chan->device->dev, &sg->sg, 1, dir); - kfree(sg->buf); + dma_free_coherent(chan->device->dev, + PL011_DMA_BUFFER_SIZE, sg->buf, + sg_dma_address(&sg->sg)); } } @@ -300,6 +310,29 @@ static void pl011_dma_probe_initcall(struct uart_amba_port *uap) dmaengine_slave_config(chan, &rx_conf); uap->dmarx.chan = chan; + if (plat->dma_rx_poll_enable) { + /* Set poll rate if specified. */ + if (plat->dma_rx_poll_rate) { + uap->dmarx.auto_poll_rate = false; + uap->dmarx.poll_rate = plat->dma_rx_poll_rate; + } else { + /* + * The poll rate defaults to 100 ms if not + * specified. This will be adjusted with + * the baud rate at set_termios.
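 + * As a worked example of that adjustment: at + * 115200 baud the poll rate becomes + * DIV_ROUND_UP(10000000, 115200) = 87 ms.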
+ */ + uap->dmarx.auto_poll_rate = true; + uap->dmarx.poll_rate = 100; + } + /* 3 secs defaults poll_timeout if not specified. */ + if (plat->dma_rx_poll_timeout) + uap->dmarx.poll_timeout = + plat->dma_rx_poll_timeout; + else + uap->dmarx.poll_timeout = 3000; + } else + uap->dmarx.auto_poll_rate = false; + dev_info(uap->port.dev, "DMA channel RX %s\n", dma_chan_name(uap->dmarx.chan)); } @@ -701,24 +734,30 @@ static void pl011_dma_rx_chars(struct uart_amba_port *uap, struct tty_port *port = &uap->port.state->port; struct pl011_sgbuf *sgbuf = use_buf_b ? &uap->dmarx.sgbuf_b : &uap->dmarx.sgbuf_a; - struct device *dev = uap->dmarx.chan->device->dev; int dma_count = 0; u32 fifotaken = 0; /* only used for vdbg() */ - /* Pick everything from the DMA first */ + struct pl011_dmarx_data *dmarx = &uap->dmarx; + int dmataken = 0; + + if (uap->dmarx.poll_rate) { + /* The data can be taken by polling */ + dmataken = sgbuf->sg.length - dmarx->last_residue; + /* Recalculate the pending size */ + if (pending >= dmataken) + pending -= dmataken; + } + + /* Pick the remain data from the DMA */ if (pending) { - /* Sync in buffer */ - dma_sync_sg_for_cpu(dev, &sgbuf->sg, 1, DMA_FROM_DEVICE); /* * First take all chars in the DMA pipe, then look in the FIFO. * Note that tty_insert_flip_buf() tries to take as many chars * as it can. */ - dma_count = tty_insert_flip_string(port, sgbuf->buf, pending); - - /* Return buffer to device */ - dma_sync_sg_for_device(dev, &sgbuf->sg, 1, DMA_FROM_DEVICE); + dma_count = tty_insert_flip_string(port, sgbuf->buf + dmataken, + pending); uap->port.icount.rx += dma_count; if (dma_count < pending) @@ -726,6 +765,10 @@ static void pl011_dma_rx_chars(struct uart_amba_port *uap, "couldn't insert all characters (TTY is full?)\n"); } + /* Reset the last_residue for Rx DMA poll */ + if (uap->dmarx.poll_rate) + dmarx->last_residue = sgbuf->sg.length; + /* * Only continue with trying to read the FIFO if all DMA chars have * been taken first. @@ -865,6 +908,57 @@ static inline void pl011_dma_rx_stop(struct uart_amba_port *uap) writew(uap->dmacr, uap->port.membase + UART011_DMACR); } +/* + * Timer handler for Rx DMA polling. + * Every polling, It checks the residue in the dma buffer and transfer + * data to the tty. Also, last_residue is updated for the next polling. + */ +static void pl011_dma_rx_poll(unsigned long args) +{ + struct uart_amba_port *uap = (struct uart_amba_port *)args; + struct tty_port *port = &uap->port.state->port; + struct pl011_dmarx_data *dmarx = &uap->dmarx; + struct dma_chan *rxchan = uap->dmarx.chan; + unsigned long flags = 0; + unsigned int dmataken = 0; + unsigned int size = 0; + struct pl011_sgbuf *sgbuf; + int dma_count; + struct dma_tx_state state; + + sgbuf = dmarx->use_buf_b ? &uap->dmarx.sgbuf_b : &uap->dmarx.sgbuf_a; + rxchan->device->device_tx_status(rxchan, dmarx->cookie, &state); + if (likely(state.residue < dmarx->last_residue)) { + dmataken = sgbuf->sg.length - dmarx->last_residue; + size = dmarx->last_residue - state.residue; + dma_count = tty_insert_flip_string(port, sgbuf->buf + dmataken, + size); + if (dma_count == size) + dmarx->last_residue = state.residue; + dmarx->last_jiffies = jiffies; + } + tty_flip_buffer_push(port); + + /* + * If no data is received in poll_timeout, the driver will fall back + * to interrupt mode. We will retrigger DMA at the first interrupt. 
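 + * With the default poll_timeout of 3000 ms, this means + * polling stops after three seconds without received data.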
+ */ + if (jiffies_to_msecs(jiffies - dmarx->last_jiffies) + > uap->dmarx.poll_timeout) { + + spin_lock_irqsave(&uap->port.lock, flags); + pl011_dma_rx_stop(uap); + spin_unlock_irqrestore(&uap->port.lock, flags); + + uap->dmarx.running = false; + dmaengine_terminate_all(rxchan); + del_timer(&uap->dmarx.timer); + } else { + mod_timer(&uap->dmarx.timer, + jiffies + msecs_to_jiffies(uap->dmarx.poll_rate)); + } +} + static void pl011_dma_startup(struct uart_amba_port *uap) { int ret; @@ -927,6 +1021,16 @@ skip_rx: if (pl011_dma_rx_trigger_dma(uap)) dev_dbg(uap->port.dev, "could not trigger initial " "RX DMA job, fall back to interrupt mode\n"); + if (uap->dmarx.poll_rate) { + init_timer(&(uap->dmarx.timer)); + uap->dmarx.timer.function = pl011_dma_rx_poll; + uap->dmarx.timer.data = (unsigned long)uap; + mod_timer(&uap->dmarx.timer, + jiffies + + msecs_to_jiffies(uap->dmarx.poll_rate)); + uap->dmarx.last_residue = PL011_DMA_BUFFER_SIZE; + uap->dmarx.last_jiffies = jiffies; + } } } @@ -962,6 +1066,8 @@ static void pl011_dma_shutdown(struct uart_amba_port *uap) /* Clean up the RX DMA */ pl011_sgbuf_free(uap->dmarx.chan, &uap->dmarx.sgbuf_a, DMA_FROM_DEVICE); pl011_sgbuf_free(uap->dmarx.chan, &uap->dmarx.sgbuf_b, DMA_FROM_DEVICE); + if (uap->dmarx.poll_rate) + del_timer_sync(&uap->dmarx.timer); uap->using_rx_dma = false; } } @@ -976,7 +1082,6 @@ static inline bool pl011_dma_rx_running(struct uart_amba_port *uap) return uap->using_rx_dma && uap->dmarx.running; } - #else /* Blank functions if the DMA engine is not available */ static inline void pl011_dma_probe(struct uart_amba_port *uap) @@ -1088,8 +1193,18 @@ static void pl011_rx_chars(struct uart_amba_port *uap) dev_dbg(uap->port.dev, "could not trigger RX DMA job " "fall back to interrupt mode again\n"); uap->im |= UART011_RXIM; - } else + } else { uap->im &= ~UART011_RXIM; + /* Start Rx DMA poll */ + if (uap->dmarx.poll_rate) { + uap->dmarx.last_jiffies = jiffies; + uap->dmarx.last_residue = PL011_DMA_BUFFER_SIZE; + mod_timer(&uap->dmarx.timer, + jiffies + + msecs_to_jiffies(uap->dmarx.poll_rate)); + } + } + writew(uap->im, uap->port.membase + UART011_IMSC); } spin_lock(&uap->port.lock); @@ -1164,7 +1279,6 @@ static irqreturn_t pl011_int(int irq, void *dev_id) unsigned int dummy_read; spin_lock_irqsave(&uap->port.lock, flags); - status = readw(uap->port.membase + UART011_MIS); if (status) { do { @@ -1551,6 +1665,11 @@ pl011_set_termios(struct uart_port *port, struct ktermios *termios, */ baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / clkdiv); + /* + * Adjust RX DMA polling rate with baud rate if not specified. + */ + if (uap->dmarx.auto_poll_rate) + uap->dmarx.poll_rate = DIV_ROUND_UP(10000000, baud); if (baud > port->uartclk/16) quot = DIV_ROUND_CLOSEST(port->uartclk * 8, baud); diff --git a/include/linux/amba/serial.h b/include/linux/amba/serial.h index f612c783170f..62d9303c2837 100644 --- a/include/linux/amba/serial.h +++ b/include/linux/amba/serial.h @@ -203,6 +203,9 @@ struct amba_pl011_data { bool (*dma_filter)(struct dma_chan *chan, void *filter_param); void *dma_rx_param; void *dma_tx_param; + bool dma_rx_poll_enable; + unsigned int dma_rx_poll_rate; + unsigned int dma_rx_poll_timeout; void (*init) (void); void (*exit) (void); }; -- cgit From d1a820011b2fbc11d5af80d1a961fe66c613fa4b Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 28 Mar 2013 16:11:01 +0000 Subject: regulator: ab8500-ext: New driver to control external regulators The ABx500 is capable of controlling three external regulator supplies. 
Most commonly on and off are supported, but if an external regulator chipset or power supply supports high-power and low-power mode settings, we can control those too. Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- drivers/regulator/Makefile | 2 +- drivers/regulator/ab8500-ext.c | 394 +++++++++++++++++++++++++++++++++++++++ drivers/regulator/ab8500.c | 12 +- include/linux/regulator/ab8500.h | 28 +++ 4 files changed, 434 insertions(+), 2 deletions(-) create mode 100644 drivers/regulator/ab8500-ext.c (limited to 'include/linux') diff --git a/drivers/regulator/Makefile b/drivers/regulator/Makefile index 6e8250382def..47a34ff88f98 100644 --- a/drivers/regulator/Makefile +++ b/drivers/regulator/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_REGULATOR_USERSPACE_CONSUMER) += userspace-consumer.o obj-$(CONFIG_REGULATOR_88PM8607) += 88pm8607.o obj-$(CONFIG_REGULATOR_AAT2870) += aat2870-regulator.o obj-$(CONFIG_REGULATOR_AB3100) += ab3100.o -obj-$(CONFIG_REGULATOR_AB8500) += ab8500.o +obj-$(CONFIG_REGULATOR_AB8500) += ab8500.o ab8500-ext.o obj-$(CONFIG_REGULATOR_AD5398) += ad5398.o obj-$(CONFIG_REGULATOR_ANATOP) += anatop-regulator.o obj-$(CONFIG_REGULATOR_ARIZONA) += arizona-micsupp.o arizona-ldo1.o diff --git a/drivers/regulator/ab8500-ext.c b/drivers/regulator/ab8500-ext.c new file mode 100644 index 000000000000..95008dec5190 --- /dev/null +++ b/drivers/regulator/ab8500-ext.c @@ -0,0 +1,394 @@ +/* + * Copyright (C) ST-Ericsson SA 2010 + * + * License Terms: GNU General Public License v2 + * + * Authors: Bengt Jonsson + * + * This file is based on drivers/regulator/ab8500.c + * + * AB8500 external regulators + * + * ab8500-ext supports the following regulators: + * - VextSupply3 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct ab8500_ext_regulator_info - ab8500 regulator information + * @dev: device pointer + * @desc: regulator description + * @rdev: regulator device + * @is_enabled: status of regulator (on/off) + * @update_bank: bank to control on/off + * @update_reg: register to control on/off + * @update_mask: mask to enable/disable and set mode of regulator + * @update_val: bits holding the regulator current mode + * @update_val_en: bits to set EN pin active (LPn pin deactive) + * normally this means high power mode + * @update_val_en_lp: bits to set EN pin active and LPn pin active + * normally this means low power mode + * @delay: startup delay in ms + */ +struct ab8500_ext_regulator_info { + struct device *dev; + struct regulator_desc desc; + struct regulator_dev *rdev; + bool is_enabled; + u8 update_bank; + u8 update_reg; + u8 update_mask; + u8 update_val; + u8 update_val_en; + u8 update_val_en_lp; +}; + +static int ab8500_ext_regulator_enable(struct regulator_dev *rdev) +{ + int ret; + struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + + if (info == NULL) { + dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); + return -EINVAL; + } + + ret = abx500_mask_and_set_register_interruptible(info->dev, + info->update_bank, info->update_reg, + info->update_mask, info->update_val); + if (ret < 0) + dev_err(rdev_get_dev(info->rdev), + "couldn't set enable bits for regulator\n"); + + info->is_enabled = true; + + dev_dbg(rdev_get_dev(rdev), "%s-enable (bank, reg, mask, value):" + " 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", + info->desc.name, info->update_bank, info->update_reg, + info->update_mask, info->update_val); + + return ret; +} + +static int ab8500_ext_regulator_disable(struct regulator_dev *rdev) +{ 
+ int ret; + struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + + if (info == NULL) { + dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); + return -EINVAL; + } + + ret = abx500_mask_and_set_register_interruptible(info->dev, + info->update_bank, info->update_reg, + info->update_mask, 0x0); + if (ret < 0) + dev_err(rdev_get_dev(info->rdev), + "couldn't set disable bits for regulator\n"); + + info->is_enabled = false; + + dev_dbg(rdev_get_dev(rdev), "%s-disable (bank, reg, mask, value):" + " 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", + info->desc.name, info->update_bank, info->update_reg, + info->update_mask, 0x0); + + return ret; +} + +static int ab8500_ext_regulator_is_enabled(struct regulator_dev *rdev) +{ + int ret; + struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + u8 regval; + + if (info == NULL) { + dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); + return -EINVAL; + } + + ret = abx500_get_register_interruptible(info->dev, + info->update_bank, info->update_reg, ®val); + if (ret < 0) { + dev_err(rdev_get_dev(rdev), + "couldn't read 0x%x register\n", info->update_reg); + return ret; + } + + dev_dbg(rdev_get_dev(rdev), "%s-is_enabled (bank, reg, mask, value):" + " 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", + info->desc.name, info->update_bank, info->update_reg, + info->update_mask, regval); + + if (regval & info->update_mask) + info->is_enabled = true; + else + info->is_enabled = false; + + return info->is_enabled; +} + +static int ab8500_ext_regulator_set_mode(struct regulator_dev *rdev, + unsigned int mode) +{ + int ret = 0; + struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + + if (info == NULL) { + dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); + return -EINVAL; + } + + switch (mode) { + case REGULATOR_MODE_NORMAL: + info->update_val = info->update_val_hp; + break; + case REGULATOR_MODE_IDLE: + info->update_val = info->update_val_lp; + break; + + default: + return -EINVAL; + } + + if (info->is_enabled) { + u8 regval; + + ret = enable(info, ®val); + if (ret < 0) + dev_err(rdev_get_dev(rdev), + "Could not set regulator mode.\n"); + + dev_dbg(rdev_get_dev(rdev), + "%s-set_mode (bank, reg, mask, value): " + "0x%x, 0x%x, 0x%x, 0x%x\n", + info->desc.name, info->update_bank, info->update_reg, + info->update_mask, regval); + } + + return ret; +} + +static unsigned int ab8500_ext_regulator_get_mode(struct regulator_dev *rdev) +{ + struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + int ret; + + if (info == NULL) { + dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); + return -EINVAL; + } + + if (info->update_val == info->update_val_hp) + ret = REGULATOR_MODE_NORMAL; + else if (info->update_val == info->update_val_lp) + ret = REGULATOR_MODE_IDLE; + else + ret = -EINVAL; + + return ret; +} + +static int ab8500_ext_fixed_get_voltage(struct regulator_dev *rdev) +{ + struct regulation_constraints *regu_constraints = rdev->constraints; + + if (regu_constraints == NULL) { + dev_err(rdev_get_dev(rdev), "regulator constraints null pointer\n"); + return -EINVAL; + } + if (regu_constraints->min_uV && regu_constraints->max_uV) { + if (regu_constraints->min_uV == regu_constraints->max_uV) + return regu_constraints->min_uV; + } + return -EINVAL; +} + +static int ab8500_ext_list_voltage(struct regulator_dev *rdev, + unsigned selector) +{ + struct regulation_constraints *regu_constraints = rdev->constraints; + + if (regu_constraints == NULL) { + dev_err(rdev_get_dev(rdev), "regulator constraints null 
pointer\n"); + return -EINVAL; + } + /* return the uV for the fixed regulators */ + if (regu_constraints->min_uV && regu_constraints->max_uV) { + if (regu_constraints->min_uV == regu_constraints->max_uV) + return regu_constraints->min_uV; + } + return -EINVAL; +} + +static struct regulator_ops ab8500_ext_regulator_ops = { + .enable = ab8500_ext_regulator_enable, + .disable = ab8500_ext_regulator_disable, + .is_enabled = ab8500_ext_regulator_is_enabled, + .set_mode = ab8500_ext_regulator_set_mode, + .get_mode = ab8500_ext_regulator_get_mode, + .get_voltage = ab8500_ext_fixed_get_voltage, + .list_voltage = ab8500_ext_list_voltage, +}; + + +static struct ab8500_ext_regulator_info + ab8500_ext_regulator_info[AB8500_NUM_EXT_REGULATORS] = { + [AB8500_EXT_SUPPLY1] = { + .desc = { + .name = "VEXTSUPPLY1", + .ops = &ab8500_ext_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_EXT_SUPPLY1, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x04, + .update_reg = 0x08, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_hp = 0x01, + .update_val_lp = 0x03, + .update_val_hw = 0x02, + }, + [AB8500_EXT_SUPPLY2] = { + .desc = { + .name = "VEXTSUPPLY2", + .ops = &ab8500_ext_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_EXT_SUPPLY2, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x04, + .update_reg = 0x08, + .update_mask = 0x0c, + .update_val = 0x04, + .update_val_hp = 0x04, + .update_val_lp = 0x0c, + .update_val_hw = 0x08, + }, + [AB8500_EXT_SUPPLY3] = { + .desc = { + .name = "VEXTSUPPLY3", + .ops = &ab8500_ext_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_EXT_SUPPLY3, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x04, + .update_reg = 0x08, + .update_mask = 0x30, + .update_val = 0x10, + .update_val_en = 0x10, + .update_val_en_lp = 0x30, + }, +}; + +int ab8500_ext_regulator_init(struct platform_device *pdev) +{ + struct ab8500 *ab8500 = dev_get_drvdata(pdev->dev.parent); + struct ab8500_platform_data *ppdata; + struct ab8500_regulator_platform_data *pdata; + struct regulator_config config = { }; + int i, err; + + if (!ab8500) { + dev_err(&pdev->dev, "null mfd parent\n"); + return -EINVAL; + } + ppdata = dev_get_platdata(ab8500->dev); + if (!ppdata) { + dev_err(&pdev->dev, "null parent pdata\n"); + return -EINVAL; + } + + pdata = ppdata->regulator; + if (!pdata) { + dev_err(&pdev->dev, "null pdata\n"); + return -EINVAL; + } + + /* make sure the platform data has the correct size */ + if (pdata->num_ext_regulator != ARRAY_SIZE(ab8500_ext_regulator_info)) { + dev_err(&pdev->dev, "Configuration error: size mismatch.\n"); + return -EINVAL; + } + + /* check for AB8500 2.x */ + if (abx500_get_chip_id(&pdev->dev) < 0x30) { + struct ab8500_ext_regulator_info *info; + + /* VextSupply3LPn is inverted on AB8500 2.x */ + info = &ab8500_ext_regulator_info[AB8500_EXT_SUPPLY3]; + info->update_val = 0x30; + info->update_val_en = 0x30; + info->update_val_en_lp = 0x10; + } + + /* register all regulators */ + for (i = 0; i < ARRAY_SIZE(ab8500_ext_regulator_info); i++) { + struct ab8500_ext_regulator_info *info = NULL; + + /* assign per-regulator data */ + info = &ab8500_ext_regulator_info[i]; + info->dev = &pdev->dev; + + config.dev = &pdev->dev; + config.init_data = &pdata->ext_regulator[i]; + config.driver_data = info; + + /* register regulator with framework */ + info->rdev = regulator_register(&info->desc, &config); + + if (IS_ERR(info->rdev)) { + err = PTR_ERR(info->rdev); + dev_err(&pdev->dev, "failed to register regulator %s\n", 
+ info->desc.name); + /* when we fail, un-register all earlier regulators */ + while (--i >= 0) { + info = &ab8500_ext_regulator_info[i]; + regulator_unregister(info->rdev); + } + return err; + } + + dev_dbg(rdev_get_dev(info->rdev), + "%s-probed\n", info->desc.name); + } + + return 0; +} + +int ab8500_ext_regulator_exit(struct platform_device *pdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ab8500_ext_regulator_info); i++) { + struct ab8500_ext_regulator_info *info = NULL; + info = &ab8500_ext_regulator_info[i]; + + dev_vdbg(rdev_get_dev(info->rdev), + "%s-remove\n", info->desc.name); + + regulator_unregister(info->rdev); + } + + return 0; +} + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Bengt Jonsson "); +MODULE_DESCRIPTION("AB8500 external regulator driver"); +MODULE_ALIAS("platform:ab8500-ext-regulator"); diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 3a1896655557..49746884923d 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -947,6 +947,11 @@ static int ab8500_regulator_probe(struct platform_device *pdev) return err; } + /* register external regulators (before Vaux1, 2 and 3) */ + err = ab8500_ext_regulator_init(pdev); + if (err) + return err; + /* register all regulators */ for (i = 0; i < ARRAY_SIZE(ab8500_regulator_info); i++) { err = ab8500_regulator_register(pdev, &pdata->regulator[i], i, NULL); @@ -959,7 +964,7 @@ static int ab8500_regulator_probe(struct platform_device *pdev) static int ab8500_regulator_remove(struct platform_device *pdev) { - int i; + int i, err; for (i = 0; i < ARRAY_SIZE(ab8500_regulator_info); i++) { struct ab8500_regulator_info *info = NULL; @@ -971,6 +976,11 @@ static int ab8500_regulator_remove(struct platform_device *pdev) regulator_unregister(info->regulator); } + /* remove external regulators (after Vaux1, 2 and 3) */ + err = ab8500_ext_regulator_exit(pdev); + if (err) + return err; + return 0; } diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 26792ff360be..4e92e5b879a5 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -10,6 +10,8 @@ #ifndef __LINUX_MFD_AB8500_REGULATOR_H #define __LINUX_MFD_AB8500_REGULATOR_H +#include + /* AB8500 regulators */ enum ab8500_regulator_id { AB8500_LDO_AUX1, @@ -140,11 +142,37 @@ enum ab9540_regulator_reg { AB9540_NUM_REGULATOR_REGISTERS, }; +/* AB8500 external regulators */ +enum ab8500_ext_regulator_id { + AB8500_EXT_SUPPLY1, + AB8500_EXT_SUPPLY2, + AB8500_EXT_SUPPLY3, + AB8500_NUM_EXT_REGULATORS, +}; + +/* AB8500 regulator platform data */ struct ab8500_regulator_platform_data { int num_reg_init; struct ab8500_regulator_reg_init *reg_init; int num_regulator; struct regulator_init_data *regulator; + int num_ext_regulator; + struct regulator_init_data *ext_regulator; }; +/* AB8500 external regulator functions (internal) */ +#ifdef CONFIG_REGULATOR_AB8500_EXT +int ab8500_ext_regulator_init(struct platform_device *pdev); +int ab8500_ext_regulator_exit(struct platform_device *pdev); +#else +inline int ab8500_ext_regulator_init(struct platform_device *pdev) +{ + return 0; +} +inline int ab8500_ext_regulator_exit(struct platform_device *pdev) +{ + return 0; +} +#endif + #endif -- cgit From 18bc2b39307b45527efc6c84836953c7a8f2181e Mon Sep 17 00:00:00 2001 From: Bengt Jonsson Date: Thu, 28 Mar 2013 16:11:06 +0000 Subject: regulator: ab8500-ext: Add HW request support Support for HW request is added in the external regulator driver. 
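As a rough sketch of how a board might opt in (the init_data array layout and constraint names below are invented for illustration; only struct ab8500_ext_regulator_cfg and its hwreq flag come from this patch):

static struct ab8500_ext_regulator_cfg vext3_cfg = {
	.hwreq = true,	/* let the HW (SysClkReq) logic drive the supply when SW is idle */
};

static struct regulator_init_data board_ext_regulators[AB8500_NUM_EXT_REGULATORS] = {
	[AB8500_EXT_SUPPLY3] = {
		.constraints = {
			.name = "vext-supply-3",	/* hypothetical name */
			.valid_ops_mask = REGULATOR_CHANGE_STATUS,
		},
		/* the driver dereferences this as an ab8500_ext_regulator_cfg */
		.driver_data = &vext3_cfg,
	},
	/* VextSupply1/2 entries omitted */
};
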
A flag in the board configuration can be set to let HW control the regulator when there is no SW request. This means that the regulator will be put in high power mode when there is a SW request and in HW-request mode otherwise. Signed-off-by: Bengt Jonsson Signed-off-by: Lee Jones Reviewed-by: Mattias NILSSON Reviewed-by: Jonas ABERG Signed-off-by: Mark Brown --- drivers/regulator/ab8500-ext.c | 98 +++++++++++++++++++++++++++++----------- include/linux/regulator/ab8500.h | 4 ++ 2 files changed, 75 insertions(+), 27 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/ab8500-ext.c b/drivers/regulator/ab8500-ext.c index 95008dec5190..21b9bfb0fc5e 100644 --- a/drivers/regulator/ab8500-ext.c +++ b/drivers/regulator/ab8500-ext.c @@ -28,80 +28,121 @@ * @dev: device pointer * @desc: regulator description * @rdev: regulator device + * @cfg: regulator configuration (extension of regulator FW configuration) * @is_enabled: status of regulator (on/off) * @update_bank: bank to control on/off * @update_reg: register to control on/off * @update_mask: mask to enable/disable and set mode of regulator * @update_val: bits holding the regulator current mode - * @update_val_en: bits to set EN pin active (LPn pin deactive) + * @update_val_hp: bits to set EN pin active (LPn pin deactive) * normally this means high power mode - * @update_val_en_lp: bits to set EN pin active and LPn pin active - * normally this means low power mode - * @delay: startup delay in ms + * @update_val_lp: bits to set EN pin active and LPn pin active + * normally this means low power mode + * @update_val_hw: bits to set regulator pins in HW control + * SysClkReq pins and logic will choose mode */ struct ab8500_ext_regulator_info { struct device *dev; struct regulator_desc desc; struct regulator_dev *rdev; + struct ab8500_ext_regulator_cfg *cfg; bool is_enabled; u8 update_bank; u8 update_reg; u8 update_mask; u8 update_val; - u8 update_val_en; - u8 update_val_en_lp; + u8 update_val_hp; + u8 update_val_lp; + u8 update_val_hw; }; -static int ab8500_ext_regulator_enable(struct regulator_dev *rdev) +static int enable(struct ab8500_ext_regulator_info *info, u8 *regval) { int ret; - struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); - if (info == NULL) { - dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); - return -EINVAL; - } + *regval = info->update_val; + + /* + * To satisfy both HW high power request and SW request, the regulator + * must be on in high power. 
+ */ + if (info->cfg && info->cfg->hwreq) + *regval = info->update_val_hp; ret = abx500_mask_and_set_register_interruptible(info->dev, info->update_bank, info->update_reg, - info->update_mask, info->update_val); + info->update_mask, *regval); if (ret < 0) dev_err(rdev_get_dev(info->rdev), "couldn't set enable bits for regulator\n"); info->is_enabled = true; - dev_dbg(rdev_get_dev(rdev), "%s-enable (bank, reg, mask, value):" - " 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", - info->desc.name, info->update_bank, info->update_reg, - info->update_mask, info->update_val); - return ret; } -static int ab8500_ext_regulator_disable(struct regulator_dev *rdev) +static int ab8500_ext_regulator_enable(struct regulator_dev *rdev) { int ret; struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + u8 regval; if (info == NULL) { dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); return -EINVAL; } + ret = enable(info, ®val); + + dev_dbg(rdev_get_dev(rdev), "%s-enable (bank, reg, mask, value):" + " 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", + info->desc.name, info->update_bank, info->update_reg, + info->update_mask, regval); + + return ret; +} + +static int disable(struct ab8500_ext_regulator_info *info, u8 *regval) +{ + int ret; + + *regval = 0x0; + + /* + * Set the regulator in HW request mode if configured + */ + if (info->cfg && info->cfg->hwreq) + *regval = info->update_val_hw; + ret = abx500_mask_and_set_register_interruptible(info->dev, info->update_bank, info->update_reg, - info->update_mask, 0x0); + info->update_mask, *regval); if (ret < 0) dev_err(rdev_get_dev(info->rdev), "couldn't set disable bits for regulator\n"); info->is_enabled = false; + return ret; +} + +static int ab8500_ext_regulator_disable(struct regulator_dev *rdev) +{ + int ret; + struct ab8500_ext_regulator_info *info = rdev_get_drvdata(rdev); + u8 regval; + + if (info == NULL) { + dev_err(rdev_get_dev(rdev), "regulator info null pointer\n"); + return -EINVAL; + } + + ret = disable(info, ®val); + dev_dbg(rdev_get_dev(rdev), "%s-disable (bank, reg, mask, value):" " 0x%02x, 0x%02x, 0x%02x, 0x%02x\n", info->desc.name, info->update_bank, info->update_reg, - info->update_mask, 0x0); + info->update_mask, regval); return ret; } @@ -130,7 +171,8 @@ static int ab8500_ext_regulator_is_enabled(struct regulator_dev *rdev) info->desc.name, info->update_bank, info->update_reg, info->update_mask, regval); - if (regval & info->update_mask) + if (((regval & info->update_mask) == info->update_val_lp) || + ((regval & info->update_mask) == info->update_val_hp)) info->is_enabled = true; else info->is_enabled = false; @@ -241,7 +283,6 @@ static struct regulator_ops ab8500_ext_regulator_ops = { .list_voltage = ab8500_ext_list_voltage, }; - static struct ab8500_ext_regulator_info ab8500_ext_regulator_info[AB8500_NUM_EXT_REGULATORS] = { [AB8500_EXT_SUPPLY1] = { @@ -291,8 +332,9 @@ static struct ab8500_ext_regulator_info .update_reg = 0x08, .update_mask = 0x30, .update_val = 0x10, - .update_val_en = 0x10, - .update_val_en_lp = 0x30, + .update_val_hp = 0x10, + .update_val_lp = 0x30, + .update_val_hw = 0x20, }, }; @@ -333,8 +375,8 @@ int ab8500_ext_regulator_init(struct platform_device *pdev) /* VextSupply3LPn is inverted on AB8500 2.x */ info = &ab8500_ext_regulator_info[AB8500_EXT_SUPPLY3]; info->update_val = 0x30; - info->update_val_en = 0x30; - info->update_val_en_lp = 0x10; + info->update_val_hp = 0x30; + info->update_val_lp = 0x10; } /* register all regulators */ @@ -344,6 +386,8 @@ int ab8500_ext_regulator_init(struct platform_device *pdev) /* 
assign per-regulator data */ info = &ab8500_ext_regulator_info[i]; info->dev = &pdev->dev; + info->cfg = (struct ab8500_ext_regulator_cfg *) + pdata->ext_regulator[i].driver_data; config.dev = &pdev->dev; config.init_data = &pdata->ext_regulator[i]; diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 4e92e5b879a5..cf496e93c0cf 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -143,6 +143,10 @@ enum ab9540_regulator_reg { }; /* AB8500 external regulators */ +struct ab8500_ext_regulator_cfg { + bool hwreq; /* requires hw mode or high power mode */ +}; + enum ab8500_ext_regulator_id { AB8500_EXT_SUPPLY1, AB8500_EXT_SUPPLY2, -- cgit From 41a06aa738ad889cf96f56024ddf84ecf4a18a6f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 28 Mar 2013 16:11:08 +0000 Subject: regulator: ab8500: Remove USB regulator The USB regulator is controlled by hardware. The software support was only needed for early hardware (ED) which is no longer supported. Signed-off-by: Bengt Jonsson Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- drivers/regulator/ab8500.c | 20 -------------------- include/linux/regulator/ab8500.h | 1 - 2 files changed, 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 49746884923d..4d88a604efd1 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -488,25 +488,6 @@ static struct ab8500_regulator_info .update_val_idle = 0x82, .update_val_normal = 0x02, }, - - /* - * Regulators with fixed voltage and normal mode - */ - [AB8500_LDO_USB] = { - .desc = { - .name = "LDO-USB", - .ops = &ab8500_regulator_ops, - .type = REGULATOR_VOLTAGE, - .id = AB8500_LDO_USB, - .owner = THIS_MODULE, - .n_voltages = 1, - .min_uV = 3300000, - .enable_time = 150, - }, - .update_bank = 0x03, - .update_reg = 0x82, - .update_mask = 0x03, - }, [AB8500_LDO_AUDIO] = { .desc = { .name = "LDO-AUDIO", @@ -862,7 +843,6 @@ static struct of_regulator_match ab8500_regulator_matches[] = { { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB8500_LDO_AUX3, }, { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB8500_LDO_INTCORE, }, { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB8500_LDO_TVOUT, }, - { .name = "ab8500_ldo_usb", .driver_data = (void *) AB8500_LDO_USB, }, { .name = "ab8500_ldo_audio", .driver_data = (void *) AB8500_LDO_AUDIO, }, { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB8500_LDO_ANAMIC1, }, { .name = "ab8500_ldo_amamic2", .driver_data = (void *) AB8500_LDO_ANAMIC2, }, diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index cf496e93c0cf..b86e089195ea 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -19,7 +19,6 @@ enum ab8500_regulator_id { AB8500_LDO_AUX3, AB8500_LDO_INTCORE, AB8500_LDO_TVOUT, - AB8500_LDO_USB, AB8500_LDO_AUDIO, AB8500_LDO_ANAMIC1, AB8500_LDO_ANAMIC2, -- cgit From da0b0c47dcfd92317e2ece4c3434e1f82b55cf8a Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Thu, 28 Mar 2013 16:11:09 +0000 Subject: regulator: ab8500: Init debug from regulator driver The purpose of this patch is to guarantee that ab8500-debug will record the regulator registers before they are modified by the ab8500 regulator driver. 
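In outline, the probe path after this patch records state first (a simplified sketch, not the literal hunk; the two placeholder calls stand in for the existing register-init loop and regulator registration, and their error handling is elided):

static int ab8500_regulator_probe(struct platform_device *pdev)
{
	int err;

	/* Snapshot the regulator registers before anything writes to them. */
	err = ab8500_regulator_debug_init(pdev);
	if (err)
		return err;

	apply_reg_init(pdev);		/* placeholder: board register init */
	register_all_regulators(pdev);	/* placeholder: regulator_register loop */

	return 0;
}
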
Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- drivers/regulator/ab8500.c | 10 ++++++++++ include/linux/regulator/ab8500.h | 14 ++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 4d88a604efd1..bf34c4cd6631 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -911,6 +911,11 @@ static int ab8500_regulator_probe(struct platform_device *pdev) return -EINVAL; } + /* initialize debug (initial state is recorded with this call) */ + err = ab8500_regulator_debug_init(pdev); + if (err) + return err; + /* initialize registers */ for (i = 0; i < pdata->num_reg_init; i++) { int id, mask, value; @@ -961,6 +966,11 @@ static int ab8500_regulator_remove(struct platform_device *pdev) if (err) return err; + /* remove regulator debug */ + err = ab8500_regulator_debug_exit(pdev); + if (err) + return err; + return 0; } diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index b86e089195ea..592a3f3994c0 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -178,4 +178,18 @@ inline int ab8500_ext_regulator_exit(struct platform_device *pdev) } #endif +#ifdef CONFIG_REGULATOR_AB8500_DEBUG +int ab8500_regulator_debug_init(struct platform_device *pdev); +int ab8500_regulator_debug_exit(struct platform_device *pdev); +#else +static inline int ab8500_regulator_debug_init(struct platform_device *pdev) +{ + return 0; +} +static inline int ab8500_regulator_debug_exit(struct platform_device *pdev) +{ + return 0; +} +#endif + #endif -- cgit From a3f109bd793dfe5c611220ca5ab6c72f1aed479e Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Thu, 28 Mar 2013 11:51:31 +0000 Subject: sh_eth: add R-Car support for real Commit d0418bb7123f44b23d69ac349eec7daf9103472f (net: sh_eth: Add eth support for R8A7779 device) was a failed attempt to add support for one of the members of the R-Car SoC family, for three reasons: it treated R8A7779 the same as SH7724, except for a quite dirty hack that added the ECMR_ELB bit to the mask in sh_eth_set_rate() without removing the ECMR_RTM bit (even though that bit is reserved on R-Car Ether); it didn't add a new register offset array, even though the closest mapping, SH_ETH_REG_FAST_SH4, differs by 0x200 from the offsets all the R-Car Ether registers have; and some of the registers in that old mapping don't exist on R-Car Ether at all (because of this, SH7724's 'sh_eth_my_cpu_data' structure is not adequate for R-Car either). Fix all these shortcomings, restoring the SH7724 related section to its pristine state... Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- drivers/net/ethernet/renesas/sh_eth.c | 107 +++++++++++++++++++++++++++++--- include/linux/sh_eth.h | 1 + 2 files changed, 100 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 13abe917cbdf..da604059b148 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2,7 +2,8 @@ * SuperH Ethernet device driver * * Copyright (C) 2006-2012 Nobuhiro Iwamatsu - * Copyright (C) 2008-2012 Renesas Solutions Corp. + * Copyright (C) 2008-2013 Renesas Solutions Corp. + * Copyright (C) 2013 Cogent Embedded, Inc. 
* * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, @@ -147,6 +148,51 @@ static const u16 sh_eth_offset_gigabit[SH_ETH_MAX_REGISTER_OFFSET] = { [FWALCR1] = 0x00b4, }; +static const u16 sh_eth_offset_fast_rcar[SH_ETH_MAX_REGISTER_OFFSET] = { + [ECMR] = 0x0300, + [RFLR] = 0x0308, + [ECSR] = 0x0310, + [ECSIPR] = 0x0318, + [PIR] = 0x0320, + [PSR] = 0x0328, + [RDMLR] = 0x0340, + [IPGR] = 0x0350, + [APR] = 0x0354, + [MPR] = 0x0358, + [RFCF] = 0x0360, + [TPAUSER] = 0x0364, + [TPAUSECR] = 0x0368, + [MAHR] = 0x03c0, + [MALR] = 0x03c8, + [TROCR] = 0x03d0, + [CDCR] = 0x03d4, + [LCCR] = 0x03d8, + [CNDCR] = 0x03dc, + [CEFCR] = 0x03e4, + [FRECR] = 0x03e8, + [TSFRCR] = 0x03ec, + [TLFRCR] = 0x03f0, + [RFCR] = 0x03f4, + [MAFCR] = 0x03f8, + + [EDMR] = 0x0200, + [EDTRR] = 0x0208, + [EDRRR] = 0x0210, + [TDLAR] = 0x0218, + [RDLAR] = 0x0220, + [EESR] = 0x0228, + [EESIPR] = 0x0230, + [TRSCER] = 0x0238, + [RMFCR] = 0x0240, + [TFTR] = 0x0248, + [FDR] = 0x0250, + [RMCR] = 0x0258, + [TFUCR] = 0x0264, + [RFOCR] = 0x0268, + [FCFTR] = 0x0270, + [TRIMD] = 0x027c, +}; + static const u16 sh_eth_offset_fast_sh4[SH_ETH_MAX_REGISTER_OFFSET] = { [ECMR] = 0x0100, [RFLR] = 0x0108, @@ -296,7 +342,7 @@ static void sh_eth_select_mii(struct net_device *ndev) #endif /* There is CPU dependent code */ -#if defined(CONFIG_CPU_SUBTYPE_SH7724) || defined(CONFIG_ARCH_R8A7779) +#if defined(CONFIG_ARCH_R8A7779) #define SH_ETH_RESET_DEFAULT 1 static void sh_eth_set_duplex(struct net_device *ndev) { @@ -311,18 +357,60 @@ static void sh_eth_set_duplex(struct net_device *ndev) static void sh_eth_set_rate(struct net_device *ndev) { struct sh_eth_private *mdp = netdev_priv(ndev); - unsigned int bits = ECMR_RTM; -#if defined(CONFIG_ARCH_R8A7779) - bits |= ECMR_ELB; -#endif + switch (mdp->speed) { + case 10: /* 10BASE */ + sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~ECMR_ELB, ECMR); + break; + case 100:/* 100BASE */ + sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | ECMR_ELB, ECMR); + break; + default: + break; + } +} + +/* R8A7779 */ +static struct sh_eth_cpu_data sh_eth_my_cpu_data = { + .set_duplex = sh_eth_set_duplex, + .set_rate = sh_eth_set_rate, + + .ecsr_value = ECSR_PSRTO | ECSR_LCHNG | ECSR_ICD, + .ecsipr_value = ECSIPR_PSRTOIP | ECSIPR_LCHNGIP | ECSIPR_ICDIP, + .eesipr_value = 0x01ff009f, + + .tx_check = EESR_FTC | EESR_CND | EESR_DLC | EESR_CD | EESR_RTO, + .eesr_err_check = EESR_TWB | EESR_TABT | EESR_RABT | EESR_RDE | + EESR_RFRMER | EESR_TFE | EESR_TDE | EESR_ECI, + .tx_error_check = EESR_TWB | EESR_TABT | EESR_TDE | EESR_TFE, + + .apr = 1, + .mpr = 1, + .tpauser = 1, + .hw_swap = 1, +}; +#elif defined(CONFIG_CPU_SUBTYPE_SH7724) +#define SH_ETH_RESET_DEFAULT 1 +static void sh_eth_set_duplex(struct net_device *ndev) +{ + struct sh_eth_private *mdp = netdev_priv(ndev); + + if (mdp->duplex) /* Full */ + sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | ECMR_DM, ECMR); + else /* Half */ + sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~ECMR_DM, ECMR); +} + +static void sh_eth_set_rate(struct net_device *ndev) +{ + struct sh_eth_private *mdp = netdev_priv(ndev); switch (mdp->speed) { case 10: /* 10BASE */ - sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~bits, ECMR); + sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~ECMR_RTM, ECMR); break; case 100:/* 100BASE */ - sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | bits, ECMR); + sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | ECMR_RTM, ECMR); break; default: break; @@ -2521,6 +2609,9 @@ static const u16 
*sh_eth_get_register_offset(int register_type) case SH_ETH_REG_GIGABIT: reg_offset = sh_eth_offset_gigabit; break; + case SH_ETH_REG_FAST_RCAR: + reg_offset = sh_eth_offset_fast_rcar; + break; case SH_ETH_REG_FAST_SH4: reg_offset = sh_eth_offset_fast_sh4; break; diff --git a/include/linux/sh_eth.h b/include/linux/sh_eth.h index b17d765ded84..fc305713fc6d 100644 --- a/include/linux/sh_eth.h +++ b/include/linux/sh_eth.h @@ -6,6 +6,7 @@ enum {EDMAC_LITTLE_ENDIAN, EDMAC_BIG_ENDIAN}; enum { SH_ETH_REG_GIGABIT, + SH_ETH_REG_FAST_RCAR, SH_ETH_REG_FAST_SH4, SH_ETH_REG_FAST_SH3_SH2 }; -- cgit From 72f4dc117b57e05120aaac6e218b8abc09a5c350 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 16 Mar 2013 15:54:25 -0400 Subject: NFS: Remove unneeded forward declaration I've built with NFSv4 enabled and disabled. This forward declaration does not seem to be required. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 90a4aa190b43..c1ca1f3f4935 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -14,9 +14,6 @@ #define NFS_DEF_FILE_IO_SIZE (4096U) #define NFS_MIN_FILE_IO_SIZE (1024U) -/* Forward declaration for NFS v3 */ -struct nfs4_secinfo_flavors; - struct nfs4_string { unsigned int len; char *data; -- cgit From fb15b26f8ba3ff629a052faf3f4a4744585ca2dc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 16 Mar 2013 15:54:34 -0400 Subject: SUNRPC: Define rpcsec_gss_info structure The NFSv4 SECINFO procedure returns a list of security flavors. Any GSS flavor also has a GSS tuple containing an OID, a quality-of- protection value, and a service value, which specifies a particular GSS pseudoflavor. For simplicity and efficiency, I'd like to return each GSS tuple from the NFSv4 SECINFO XDR decoder and pass it straight into the RPC client. Define a data structure that is visible to both the NFS client and the RPC client. Take structure and field names from the relevant standards to avoid confusion. 
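For reference, the structure as introduced below in <linux/sunrpc/gss_api.h>, together with a hypothetical helper showing the intended decoder-side flow (the helper itself is illustrative and not part of the patch):

/* Definitions from this patch: */
struct rpcsec_gss_oid {
	unsigned int	len;
	u8		data[GSS_OID_MAX_LEN];
};

struct rpcsec_gss_info {
	struct rpcsec_gss_oid	oid;
	u32			qop;
	u32			service;
};

/* Hypothetical helper a decoder could use to fill in one GSS tuple: */
static int fill_gss_tuple(struct rpcsec_gss_info *info,
			  const u8 *oid, unsigned int oid_len,
			  u32 qop, u32 service)
{
	if (oid_len > GSS_OID_MAX_LEN)
		return -EINVAL;		/* mirrors the XDR decoder's bound check */
	memcpy(info->oid.data, oid, oid_len);
	info->oid.len = oid_len;
	info->qop = qop;
	info->service = service;
	return 0;
}
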
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 12 ++++++------ fs/nfs/nfs4xdr.c | 21 ++++++++++++--------- include/linux/nfs_xdr.h | 21 +++++---------------- include/linux/sunrpc/gss_api.h | 14 ++++++++++++-- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 +- 5 files changed, 36 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 0dd766079e1c..88231c92317c 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -138,23 +138,23 @@ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { struct gss_api_mech *mech; struct xdr_netobj oid; - int i; + unsigned int i; rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; for (i = 0; i < flavors->num_flavors; i++) { - struct nfs4_secinfo_flavor *flavor; - flavor = &flavors->flavors[i]; + struct nfs4_secinfo4 *flavor = &flavors->flavors[i]; if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { pseudoflavor = flavor->flavor; break; } else if (flavor->flavor == RPC_AUTH_GSS) { - oid.len = flavor->gss.sec_oid4.len; - oid.data = flavor->gss.sec_oid4.data; + oid.len = flavor->flavor_info.oid.len; + oid.data = flavor->flavor_info.oid.data; mech = gss_mech_get_by_OID(&oid); if (!mech) continue; - pseudoflavor = gss_svc_to_pseudoflavor(mech, flavor->gss.service); + pseudoflavor = gss_svc_to_pseudoflavor(mech, + flavor->flavor_info.service); gss_mech_put(mech); break; } diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0b744895b9e1..a38fd179c34f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -5205,27 +5205,30 @@ static int decode_delegreturn(struct xdr_stream *xdr) return decode_op_hdr(xdr, OP_DELEGRETURN); } -static int decode_secinfo_gss(struct xdr_stream *xdr, struct nfs4_secinfo_flavor *flavor) +static int decode_secinfo_gss(struct xdr_stream *xdr, + struct nfs4_secinfo4 *flavor) { + u32 oid_len; __be32 *p; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; - flavor->gss.sec_oid4.len = be32_to_cpup(p); - if (flavor->gss.sec_oid4.len > GSS_OID_MAX_LEN) + oid_len = be32_to_cpup(p); + if (oid_len > GSS_OID_MAX_LEN) goto out_err; - p = xdr_inline_decode(xdr, flavor->gss.sec_oid4.len); + p = xdr_inline_decode(xdr, oid_len); if (unlikely(!p)) goto out_overflow; - memcpy(flavor->gss.sec_oid4.data, p, flavor->gss.sec_oid4.len); + memcpy(flavor->flavor_info.oid.data, p, oid_len); + flavor->flavor_info.oid.len = oid_len; p = xdr_inline_decode(xdr, 8); if (unlikely(!p)) goto out_overflow; - flavor->gss.qop4 = be32_to_cpup(p++); - flavor->gss.service = be32_to_cpup(p); + flavor->flavor_info.qop = be32_to_cpup(p++); + flavor->flavor_info.service = be32_to_cpup(p); return 0; @@ -5238,10 +5241,10 @@ out_err: static int decode_secinfo_common(struct xdr_stream *xdr, struct nfs4_secinfo_res *res) { - struct nfs4_secinfo_flavor *sec_flavor; + struct nfs4_secinfo4 *sec_flavor; + unsigned int i, num_flavors; int status; __be32 *p; - int i, num_flavors; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c1ca1f3f4935..b759467741eb 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1049,25 +1049,14 @@ struct nfs4_fs_locations_res { struct nfs4_fs_locations *fs_locations; }; -struct nfs4_secinfo_oid { - unsigned int len; - char data[GSS_OID_MAX_LEN]; -}; - -struct nfs4_secinfo_gss { - struct nfs4_secinfo_oid sec_oid4; - unsigned int qop4; - unsigned int service; -}; - -struct nfs4_secinfo_flavor { - unsigned int 
flavor; - struct nfs4_secinfo_gss gss; +struct nfs4_secinfo4 { + u32 flavor; + struct rpcsec_gss_info flavor_info; }; struct nfs4_secinfo_flavors { - unsigned int num_flavors; - struct nfs4_secinfo_flavor flavors[0]; + unsigned int num_flavors; + struct nfs4_secinfo4 flavors[0]; }; struct nfs4_secinfo_arg { diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index a19e2547ae6a..98950e5a8877 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -25,10 +25,20 @@ struct gss_ctx { #define GSS_C_NO_BUFFER ((struct xdr_netobj) 0) #define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) -#define GSS_C_NULL_OID ((struct xdr_netobj) 0) /*XXX arbitrary length - is this set somewhere? */ #define GSS_OID_MAX_LEN 32 +struct rpcsec_gss_oid { + unsigned int len; + u8 data[GSS_OID_MAX_LEN]; +}; + +/* From RFC 3530 */ +struct rpcsec_gss_info { + struct rpcsec_gss_oid oid; + u32 qop; + u32 service; +}; /* gss-api prototypes; note that these are somewhat simplified versions of * the prototypes specified in RFC 2744. */ @@ -76,7 +86,7 @@ struct pf_desc { struct gss_api_mech { struct list_head gm_list; struct module *gm_owner; - struct xdr_netobj gm_oid; + struct rpcsec_gss_oid gm_oid; char *gm_name; const struct gss_api_ops *gm_ops; /* pseudoflavors supported by this mechanism: */ diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index d3611f11a8df..61d36ce3b366 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -754,7 +754,7 @@ MODULE_ALIAS("rpc-auth-gss-390005"); static struct gss_api_mech gss_kerberos_mech = { .gm_name = "krb5", .gm_owner = THIS_MODULE, - .gm_oid = {9, (void *)"\x2a\x86\x48\x86\xf7\x12\x01\x02\x02"}, + .gm_oid = { 9, "\x2a\x86\x48\x86\xf7\x12\x01\x02\x02" }, .gm_ops = &gss_kerberos_ops, .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), .gm_pfs = gss_kerberos_pfs, -- cgit From 9568c5e9a61de49f67f524404a27a1014a8d7f1e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 16 Mar 2013 15:54:43 -0400 Subject: SUNRPC: Introduce rpcauth_get_pseudoflavor() A SECINFO reply may contain flavors whose kernel module is not yet loaded by the client's kernel. A new RPC client API, called rpcauth_get_pseudoflavor(), is introduced to do proper checking for support of a security flavor. When this API is invoked, the RPC client now tries to load the module for each flavor first before performing the "is this supported?" check. This means if a module is available on the client, but has not been loaded yet, it will be loaded and registered automatically when the SECINFO reply is processed. The new API can take a full GSS tuple (OID, QoP, and service). Previously only the OID and service were considered. nfs_find_best_sec() is updated to verify all flavors requested in a SECINFO reply, including AUTH_NULL and AUTH_UNIX. Previously these two flavors were simply assumed to be supported without consulting the RPC client. Note that the replaced version of nfs_find_best_sec() can return RPC_AUTH_MAXFLAVOR if the server returns a recognized OID but an unsupported "service" value. nfs_find_best_sec() now returns RPC_AUTH_UNIX in this case. 
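Condensed, the selection logic this enables looks like the following (a sketch of the nfs_find_best_sec() rewrite shown in the diff below; the real code additionally switches on the flavor number before calling into the RPC client):

static rpc_authflavor_t pick_sec_flavor(struct nfs4_secinfo_flavors *flavors)
{
	rpc_authflavor_t pseudoflavor;
	unsigned int i;

	for (i = 0; i < flavors->num_flavors; i++) {
		struct nfs4_secinfo4 *s = &flavors->flavors[i];

		/* May load "rpc-auth-<flavor>" on demand, then verify it. */
		pseudoflavor = rpcauth_get_pseudoflavor(s->flavor,
							&s->flavor_info);
		if (pseudoflavor != RPC_AUTH_MAXFLAVOR)
			return pseudoflavor;	/* first supported flavor wins */
	}
	return RPC_AUTH_UNIX;	/* fallback when nothing matches locally */
}
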
Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 41 ++++++++++++++++++++--------------- include/linux/sunrpc/auth.h | 5 +++++ include/linux/sunrpc/gss_api.h | 5 ++--- net/sunrpc/auth.c | 35 ++++++++++++++++++++++++++++++ net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/auth_gss/gss_mech_switch.c | 28 +++++++++++++++++++----- 6 files changed, 89 insertions(+), 26 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 88231c92317c..cdb0b41a4810 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -134,33 +134,38 @@ static size_t nfs_parse_server_name(char *string, size_t len, return ret; } +/** + * nfs_find_best_sec - Find a security mechanism supported locally + * @flavors: List of security tuples returned by SECINFO procedure + * + * Return the pseudoflavor of the first security mechanism in + * "flavors" that is locally supported. Return RPC_AUTH_UNIX if + * no matching flavor is found in the array. The "flavors" array + * is searched in the order returned from the server, per RFC 3530 + * recommendation. + */ rpc_authflavor_t nfs_find_best_sec(struct nfs4_secinfo_flavors *flavors) { - struct gss_api_mech *mech; - struct xdr_netobj oid; + rpc_authflavor_t pseudoflavor; + struct nfs4_secinfo4 *secinfo; unsigned int i; - rpc_authflavor_t pseudoflavor = RPC_AUTH_UNIX; for (i = 0; i < flavors->num_flavors; i++) { - struct nfs4_secinfo4 *flavor = &flavors->flavors[i]; - - if (flavor->flavor == RPC_AUTH_NULL || flavor->flavor == RPC_AUTH_UNIX) { - pseudoflavor = flavor->flavor; - break; - } else if (flavor->flavor == RPC_AUTH_GSS) { - oid.len = flavor->flavor_info.oid.len; - oid.data = flavor->flavor_info.oid.data; - mech = gss_mech_get_by_OID(&oid); - if (!mech) - continue; - pseudoflavor = gss_svc_to_pseudoflavor(mech, - flavor->flavor_info.service); - gss_mech_put(mech); + secinfo = &flavors->flavors[i]; + + switch (secinfo->flavor) { + case RPC_AUTH_NULL: + case RPC_AUTH_UNIX: + case RPC_AUTH_GSS: + pseudoflavor = rpcauth_get_pseudoflavor(secinfo->flavor, + &secinfo->flavor_info); + if (pseudoflavor != RPC_AUTH_MAXFLAVOR) + return pseudoflavor; break; } } - return pseudoflavor; + return RPC_AUTH_UNIX; } static rpc_authflavor_t nfs4_negotiate_security(struct inode *inode, struct qstr *name) diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 58fda1c3c783..6851da4cb416 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -22,6 +22,8 @@ /* size of the nodename buffer */ #define UNX_MAXNODENAME 32 +struct rpcsec_gss_info; + /* Work around the lack of a VFS credential */ struct auth_cred { kuid_t uid; @@ -103,6 +105,7 @@ struct rpc_authops { int (*pipes_create)(struct rpc_auth *); void (*pipes_destroy)(struct rpc_auth *); int (*list_pseudoflavors)(rpc_authflavor_t *, int); + rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); }; struct rpc_credops { @@ -137,6 +140,8 @@ int rpcauth_register(const struct rpc_authops *); int rpcauth_unregister(const struct rpc_authops *); struct rpc_auth * rpcauth_create(rpc_authflavor_t, struct rpc_clnt *); void rpcauth_release(struct rpc_auth *); +rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, + struct rpcsec_gss_info *); int rpcauth_list_flavors(rpc_authflavor_t *, int); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); diff --git 
a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 98950e5a8877..aba7687ca884 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -127,9 +127,8 @@ struct gss_api_ops { int gss_mech_register(struct gss_api_mech *); void gss_mech_unregister(struct gss_api_mech *); -/* returns a mechanism descriptor given an OID, and increments the mechanism's - * reference count. */ -struct gss_api_mech * gss_mech_get_by_OID(struct xdr_netobj *); +/* Given a GSS security tuple, look up a pseudoflavor */ +rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *); /* Returns a reference to a mechanism, given a name like "krb5" etc. */ struct gss_api_mech *gss_mech_get_by_name(const char *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index f5294047df77..9b81be8d9946 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -123,6 +123,41 @@ rpcauth_unregister(const struct rpc_authops *ops) } EXPORT_SYMBOL_GPL(rpcauth_unregister); +/** + * rpcauth_get_pseudoflavor - check if security flavor is supported + * @flavor: a security flavor + * @info: a GSS mech OID, quality of protection, and service value + * + * Verifies that an appropriate kernel module is available or already loaded. + * Returns an equivalent pseudoflavor, or RPC_AUTH_MAXFLAVOR if "flavor" is + * not supported locally. + */ +rpc_authflavor_t +rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) +{ + const struct rpc_authops *ops; + rpc_authflavor_t pseudoflavor; + + ops = auth_flavors[flavor]; + if (ops == NULL) + request_module("rpc-auth-%u", flavor); + spin_lock(&rpc_authflavor_lock); + ops = auth_flavors[flavor]; + if (ops == NULL || !try_module_get(ops->owner)) { + spin_unlock(&rpc_authflavor_lock); + return RPC_AUTH_MAXFLAVOR; + } + spin_unlock(&rpc_authflavor_lock); + + pseudoflavor = flavor; + if (ops->info2flavor != NULL) + pseudoflavor = ops->info2flavor(info); + + module_put(ops->owner); + return pseudoflavor; +} +EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); + /** * rpcauth_list_flavors - discover registered flavors and pseudoflavors * @array: array to fill in diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 282dfb14db05..a7420076ef39 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1641,6 +1641,7 @@ static const struct rpc_authops authgss_ops = { .pipes_create = gss_pipes_dentries_create, .pipes_destroy = gss_pipes_dentries_destroy, .list_pseudoflavors = gss_mech_list_pseudoflavors, + .info2flavor = gss_mech_info2flavor, }; static const struct rpc_credops gss_credops = { diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index f0f4eee63a35..4db66f5f490e 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -171,8 +171,7 @@ struct gss_api_mech * gss_mech_get_by_name(const char *name) } EXPORT_SYMBOL_GPL(gss_mech_get_by_name); -struct gss_api_mech * -gss_mech_get_by_OID(struct xdr_netobj *obj) +static struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) { struct gss_api_mech *pos, *gm = NULL; @@ -188,11 +187,8 @@ gss_mech_get_by_OID(struct xdr_netobj *obj) } spin_unlock(®istered_mechs_lock); return gm; - } -EXPORT_SYMBOL_GPL(gss_mech_get_by_OID); - static inline int mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) { @@ -282,6 +278,28 @@ gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) } EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor); +/** + * 
gss_mech_info2flavor - look up a pseudoflavor given a GSS tuple + * @info: a GSS mech OID, quality of protection, and service value + * + * Returns a matching pseudoflavor, or RPC_AUTH_MAXFLAVOR if the tuple is + * not supported. + */ +rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *info) +{ + rpc_authflavor_t pseudoflavor; + struct gss_api_mech *gm; + + gm = gss_mech_get_by_OID(&info->oid); + if (gm == NULL) + return RPC_AUTH_MAXFLAVOR; + + pseudoflavor = gss_svc_to_pseudoflavor(gm, info->service); + + gss_mech_put(gm); + return pseudoflavor; +} + u32 gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) { -- cgit From 83523d083a045a2069e5f3443d2e4f810a6e6d9a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 16 Mar 2013 15:55:01 -0400 Subject: SUNRPC: Consider qop when looking up pseudoflavors The NFSv4 SECINFO operation returns a list of security flavors that the server supports for a particular share. An NFSv4 client is supposed to pick a pseudoflavor it supports that corresponds to one of the flavors returned by the server. GSS flavors in this list have a GSS tuple that identify a specific GSS pseudoflavor. Currently our client ignores the GSS tuple's "qop" value. A matching pseudoflavor is chosen based only on the OID and service value. So far this omission has not had much effect on Linux. The NFSv4 protocol currently supports only one qop value: GSS_C_QOP_DEFAULT, also known as zero. However, if an NFSv4 server happens to return something other than zero in the qop field, our client won't notice. This could cause the client to behave in incorrect ways that could have security implications. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_api.h | 5 ++++- net/sunrpc/auth_gss/gss_krb5_mech.c | 3 +++ net/sunrpc/auth_gss/gss_mech_switch.c | 20 ++++++++++++++------ net/sunrpc/auth_gss/svcauth_gss.c | 4 +++- 4 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index aba7687ca884..96e5a81a54d7 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -25,6 +25,7 @@ struct gss_ctx { #define GSS_C_NO_BUFFER ((struct xdr_netobj) 0) #define GSS_C_NO_CONTEXT ((struct gss_ctx *) 0) +#define GSS_C_QOP_DEFAULT (0) /*XXX arbitrary length - is this set somewhere? 
*/ #define GSS_OID_MAX_LEN 32 @@ -68,12 +69,14 @@ u32 gss_unwrap( u32 gss_delete_sec_context( struct gss_ctx **ctx_id); -u32 gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 service); +rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *, u32 qop, + u32 service); u32 gss_pseudoflavor_to_service(struct gss_api_mech *, u32 pseudoflavor); char *gss_service_to_auth_domain_name(struct gss_api_mech *, u32 service); struct pf_desc { u32 pseudoflavor; + u32 qop; u32 service; char *name; char *auth_domain_name; diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index b822ec5cdc58..33255ff889c0 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -729,16 +729,19 @@ static const struct gss_api_ops gss_kerberos_ops = { static struct pf_desc gss_kerberos_pfs[] = { [0] = { .pseudoflavor = RPC_AUTH_GSS_KRB5, + .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_NONE, .name = "krb5", }, [1] = { .pseudoflavor = RPC_AUTH_GSS_KRB5I, + .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_INTEGRITY, .name = "krb5i", }, [2] = { .pseudoflavor = RPC_AUTH_GSS_KRB5P, + .qop = GSS_C_QOP_DEFAULT, .service = RPC_GSS_SVC_PRIVACY, .name = "krb5p", }, diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 92a72404e6d5..81fb6f3e2424 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -271,19 +271,27 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) return i; } -u32 -gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 service) +/** + * gss_svc_to_pseudoflavor - map a GSS service number to a pseudoflavor + * @gm: GSS mechanism handle + * @qop: GSS quality-of-protection value + * @service: GSS service value + * + * Returns a matching security flavor, or RPC_AUTH_MAXFLAVOR if none is found. + */ +rpc_authflavor_t gss_svc_to_pseudoflavor(struct gss_api_mech *gm, u32 qop, + u32 service) { int i; for (i = 0; i < gm->gm_pf_num; i++) { - if (gm->gm_pfs[i].service == service) { + if (gm->gm_pfs[i].qop == qop && + gm->gm_pfs[i].service == service) { return gm->gm_pfs[i].pseudoflavor; } } - return RPC_AUTH_MAXFLAVOR; /* illegal value */ + return RPC_AUTH_MAXFLAVOR; } -EXPORT_SYMBOL_GPL(gss_svc_to_pseudoflavor); /** * gss_mech_info2flavor - look up a pseudoflavor given a GSS tuple @@ -301,7 +309,7 @@ rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *info) if (gm == NULL) return RPC_AUTH_MAXFLAVOR; - pseudoflavor = gss_svc_to_pseudoflavor(gm, info->service); + pseudoflavor = gss_svc_to_pseudoflavor(gm, info->qop, info->service); gss_mech_put(gm); return pseudoflavor; diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index f7d34e7b6f81..74f6d30f5ded 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1216,7 +1216,9 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp) svcdata->rsci = rsci; cache_get(&rsci->h); rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor( - rsci->mechctx->mech_type, gc->gc_svc); + rsci->mechctx->mech_type, + GSS_C_QOP_DEFAULT, + gc->gc_svc); ret = SVC_OK; goto out; } -- cgit From a77c806fb9d097bb7733b64207cf52fc2c6438bb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 16 Mar 2013 15:55:10 -0400 Subject: SUNRPC: Refactor nfsd4_do_encode_secinfo() Clean up. This matches a similar API for the client side, and keeps ULP fingers out the of the GSS mech switch. Signed-off-by: Chuck Lever Acked-by: J. 
Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfsd/nfs4xdr.c | 24 +++++++++++------------- include/linux/sunrpc/auth.h | 4 ++++ include/linux/sunrpc/gss_api.h | 3 +++ net/sunrpc/auth.c | 35 +++++++++++++++++++++++++++++++++++ net/sunrpc/auth_gss/auth_gss.c | 1 + net/sunrpc/auth_gss/gss_mech_switch.c | 35 +++++++++++++++++++++++++++++++++-- 6 files changed, 87 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 01168865dd37..2a2745615b42 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -3138,10 +3138,9 @@ nfsd4_encode_rename(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_ static __be32 nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, - __be32 nfserr,struct svc_export *exp) + __be32 nfserr, struct svc_export *exp) { - int i = 0; - u32 nflavs; + u32 i, nflavs; struct exp_flavor_info *flavs; struct exp_flavor_info def_flavs[2]; __be32 *p; @@ -3172,30 +3171,29 @@ nfsd4_do_encode_secinfo(struct nfsd4_compoundres *resp, WRITE32(nflavs); ADJUST_ARGS(); for (i = 0; i < nflavs; i++) { - u32 flav = flavs[i].pseudoflavor; - struct gss_api_mech *gm = gss_mech_get_by_pseudoflavor(flav); + struct rpcsec_gss_info info; - if (gm) { + if (rpcauth_get_gssinfo(flavs[i].pseudoflavor, &info) == 0) { RESERVE_SPACE(4); WRITE32(RPC_AUTH_GSS); ADJUST_ARGS(); - RESERVE_SPACE(4 + gm->gm_oid.len); - WRITE32(gm->gm_oid.len); - WRITEMEM(gm->gm_oid.data, gm->gm_oid.len); + RESERVE_SPACE(4 + info.oid.len); + WRITE32(info.oid.len); + WRITEMEM(info.oid.data, info.oid.len); ADJUST_ARGS(); RESERVE_SPACE(4); - WRITE32(0); /* qop */ + WRITE32(info.qop); ADJUST_ARGS(); RESERVE_SPACE(4); - WRITE32(gss_pseudoflavor_to_service(gm, flav)); + WRITE32(info.service); ADJUST_ARGS(); - gss_mech_put(gm); } else { RESERVE_SPACE(4); - WRITE32(flav); + WRITE32(flavs[i].pseudoflavor); ADJUST_ARGS(); } } + out: if (exp) exp_put(exp); diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h index 6851da4cb416..0dd00f4f6810 100644 --- a/include/linux/sunrpc/auth.h +++ b/include/linux/sunrpc/auth.h @@ -106,6 +106,8 @@ struct rpc_authops { void (*pipes_destroy)(struct rpc_auth *); int (*list_pseudoflavors)(rpc_authflavor_t *, int); rpc_authflavor_t (*info2flavor)(struct rpcsec_gss_info *); + int (*flavor2info)(rpc_authflavor_t, + struct rpcsec_gss_info *); }; struct rpc_credops { @@ -142,6 +144,8 @@ struct rpc_auth * rpcauth_create(rpc_authflavor_t, struct rpc_clnt *); void rpcauth_release(struct rpc_auth *); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t, struct rpcsec_gss_info *); +int rpcauth_get_gssinfo(rpc_authflavor_t, + struct rpcsec_gss_info *); int rpcauth_list_flavors(rpc_authflavor_t *, int); struct rpc_cred * rpcauth_lookup_credcache(struct rpc_auth *, struct auth_cred *, int); void rpcauth_init_cred(struct rpc_cred *, const struct auth_cred *, struct rpc_auth *, const struct rpc_credops *); diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index 96e5a81a54d7..fca23380e667 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -133,6 +133,9 @@ void gss_mech_unregister(struct gss_api_mech *); /* Given a GSS security tuple, look up a pseudoflavor */ rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *); +/* Given a pseudoflavor, look up a GSS security tuple */ +int gss_mech_flavor2info(rpc_authflavor_t, struct rpcsec_gss_info *); + /* Returns a reference to a mechanism, given a name like "krb5" etc. 
*/ struct gss_api_mech *gss_mech_get_by_name(const char *); diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 9b81be8d9946..2bc0cc2196e0 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -158,6 +158,41 @@ rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); +/** + * rpcauth_get_gssinfo - find GSS tuple matching a GSS pseudoflavor + * @pseudoflavor: GSS pseudoflavor to match + * @info: rpcsec_gss_info structure to fill in + * + * Returns zero and fills in "info" if pseudoflavor matches a + * supported mechanism. + */ +int +rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) +{ + rpc_authflavor_t flavor = pseudoflavor_to_flavor(pseudoflavor); + const struct rpc_authops *ops; + int result; + + ops = auth_flavors[flavor]; + if (ops == NULL) + request_module("rpc-auth-%u", flavor); + spin_lock(&rpc_authflavor_lock); + ops = auth_flavors[flavor]; + if (ops == NULL || !try_module_get(ops->owner)) { + spin_unlock(&rpc_authflavor_lock); + return -ENOENT; + } + spin_unlock(&rpc_authflavor_lock); + + result = -ENOENT; + if (ops->flavor2info != NULL) + result = ops->flavor2info(pseudoflavor, info); + + module_put(ops->owner); + return result; +} +EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); + /** * rpcauth_list_flavors - discover registered flavors and pseudoflavors * @array: array to fill in diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index a7420076ef39..51415b07174e 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1642,6 +1642,7 @@ static const struct rpc_authops authgss_ops = { .pipes_destroy = gss_pipes_dentries_destroy, .list_pseudoflavors = gss_mech_list_pseudoflavors, .info2flavor = gss_mech_info2flavor, + .flavor2info = gss_mech_flavor2info, }; static const struct rpc_credops gss_credops = { diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 81fb6f3e2424..deaa7ae81cdf 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -240,8 +240,6 @@ gss_mech_get_by_pseudoflavor(u32 pseudoflavor) return gm; } -EXPORT_SYMBOL_GPL(gss_mech_get_by_pseudoflavor); - /** * gss_mech_list_pseudoflavors - Discover registered GSS pseudoflavors * @array: array to fill in @@ -315,6 +313,39 @@ rpc_authflavor_t gss_mech_info2flavor(struct rpcsec_gss_info *info) return pseudoflavor; } +/** + * gss_mech_flavor2info - look up a GSS tuple for a given pseudoflavor + * @pseudoflavor: GSS pseudoflavor to match + * @info: rpcsec_gss_info structure to fill in + * + * Returns zero and fills in "info" if pseudoflavor matches a + * supported mechanism. Otherwise a negative errno is returned. 
+ */ +int gss_mech_flavor2info(rpc_authflavor_t pseudoflavor, + struct rpcsec_gss_info *info) +{ + struct gss_api_mech *gm; + int i; + + gm = gss_mech_get_by_pseudoflavor(pseudoflavor); + if (gm == NULL) + return -ENOENT; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) { + memcpy(info->oid.data, gm->gm_oid.data, gm->gm_oid.len); + info->oid.len = gm->gm_oid.len; + info->qop = gm->gm_pfs[i].qop; + info->service = gm->gm_pfs[i].service; + gss_mech_put(gm); + return 0; + } + } + + gss_mech_put(gm); + return -ENOENT; +} + u32 gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) { -- cgit From 6599c0acae10e929b5315821c1d064cd13fe7648 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 16 Mar 2013 15:55:19 -0400 Subject: SUNRPC: Make gss_mech_get() static gss_mech_get() is no longer used outside of gss_mech_switch.c. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/gss_api.h | 3 --- net/sunrpc/auth_gss/gss_mech_switch.c | 5 +---- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/gss_api.h b/include/linux/sunrpc/gss_api.h index fca23380e667..f32b7a47e13f 100644 --- a/include/linux/sunrpc/gss_api.h +++ b/include/linux/sunrpc/gss_api.h @@ -145,9 +145,6 @@ struct gss_api_mech *gss_mech_get_by_pseudoflavor(u32); /* Fill in an array with a list of supported pseudoflavors */ int gss_mech_list_pseudoflavors(rpc_authflavor_t *, int); -/* Just increments the mechanism's reference count and returns its input: */ -struct gss_api_mech * gss_mech_get(struct gss_api_mech *); - /* For every successful gss_mech_get or gss_mech_get_by_* call there must be a * corresponding call to gss_mech_put. */ void gss_mech_put(struct gss_api_mech *); diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index deaa7ae81cdf..89416522ef79 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -132,15 +132,12 @@ gss_mech_unregister(struct gss_api_mech *gm) EXPORT_SYMBOL_GPL(gss_mech_unregister); -struct gss_api_mech * -gss_mech_get(struct gss_api_mech *gm) +static struct gss_api_mech *gss_mech_get(struct gss_api_mech *gm) { __module_get(gm->gm_owner); return gm; } -EXPORT_SYMBOL_GPL(gss_mech_get); - static struct gss_api_mech * _gss_mech_get_by_name(const char *name) { -- cgit From e4bcda28344cc4762c57ad7333f0472a39e83479 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Fri, 29 Mar 2013 17:38:18 -0600 Subject: ARM: tegra: move to This is required so that code such as Tegra's PCIe and clock drivers can still access this header file once Tegra is converted to multiplatform, and no longer exists. 
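For illustration, a minimal sketch of a consumer of the relocated header (the wrapper function, device argument and "pex" clock name are hypothetical; the powergate calls and the clk-disabled calling convention are taken from the declarations in the new include/linux/tegra-powergate.h shown below):

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/tegra-powergate.h>

static int example_pcie_power_up(struct device *dev)
{
	struct clk *clk;
	int err;

	clk = clk_get(dev, "pex");	/* hypothetical clock name */
	if (IS_ERR(clk))
		return PTR_ERR(clk);

	/* Must be called with clk disabled; returns with clk enabled. */
	err = tegra_powergate_sequence_power_up(TEGRA_POWERGATE_PCIE, clk);
	if (err)
		clk_put(clk);
	return err;
}

Passing the clock to the helper lets the powergate code keep the partition clock running while the reset is released, which is why the caller hands it over disabled and gets it back enabled.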
Signed-off-by: Stephen Warren --- arch/arm/mach-tegra/board.h | 1 + arch/arm/mach-tegra/common.c | 2 -- arch/arm/mach-tegra/include/mach/powergate.h | 54 ---------------------------- arch/arm/mach-tegra/pcie.c | 3 +- arch/arm/mach-tegra/powergate.c | 3 +- drivers/clk/tegra/clk-tegra30.c | 3 +- include/linux/tegra-powergate.h | 49 +++++++++++++++++++++++++ 7 files changed, 53 insertions(+), 62 deletions(-) delete mode 100644 arch/arm/mach-tegra/include/mach/powergate.h create mode 100644 include/linux/tegra-powergate.h (limited to 'include/linux') diff --git a/arch/arm/mach-tegra/board.h b/arch/arm/mach-tegra/board.h index 60431de585ca..1787327fae3a 100644 --- a/arch/arm/mach-tegra/board.h +++ b/arch/arm/mach-tegra/board.h @@ -40,6 +40,7 @@ int tegra_clk_debugfs_init(void); static inline int tegra_clk_debugfs_init(void) { return 0; } #endif +int __init tegra_powergate_init(void); #if defined(CONFIG_ARCH_TEGRA_2x_SOC) && defined(CONFIG_DEBUG_FS) int __init tegra_powergate_debugfs_init(void); #else diff --git a/arch/arm/mach-tegra/common.c b/arch/arm/mach-tegra/common.c index f0315c95c76d..7cc75636adc6 100644 --- a/arch/arm/mach-tegra/common.c +++ b/arch/arm/mach-tegra/common.c @@ -27,8 +27,6 @@ #include -#include - #include "board.h" #include "common.h" #include "fuse.h" diff --git a/arch/arm/mach-tegra/include/mach/powergate.h b/arch/arm/mach-tegra/include/mach/powergate.h deleted file mode 100644 index 06763fe7529d..000000000000 --- a/arch/arm/mach-tegra/include/mach/powergate.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * drivers/regulator/tegra-regulator.c - * - * Copyright (c) 2010 Google, Inc - * - * Author: - * Colin Cross - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- * - */ - -#ifndef _MACH_TEGRA_POWERGATE_H_ -#define _MACH_TEGRA_POWERGATE_H_ - -struct clk; - -#define TEGRA_POWERGATE_CPU 0 -#define TEGRA_POWERGATE_3D 1 -#define TEGRA_POWERGATE_VENC 2 -#define TEGRA_POWERGATE_PCIE 3 -#define TEGRA_POWERGATE_VDEC 4 -#define TEGRA_POWERGATE_L2 5 -#define TEGRA_POWERGATE_MPE 6 -#define TEGRA_POWERGATE_HEG 7 -#define TEGRA_POWERGATE_SATA 8 -#define TEGRA_POWERGATE_CPU1 9 -#define TEGRA_POWERGATE_CPU2 10 -#define TEGRA_POWERGATE_CPU3 11 -#define TEGRA_POWERGATE_CELP 12 -#define TEGRA_POWERGATE_3D1 13 - -#define TEGRA_POWERGATE_CPU0 TEGRA_POWERGATE_CPU -#define TEGRA_POWERGATE_3D0 TEGRA_POWERGATE_3D - -int __init tegra_powergate_init(void); - -int tegra_cpu_powergate_id(int cpuid); -int tegra_powergate_is_powered(int id); -int tegra_powergate_power_on(int id); -int tegra_powergate_power_off(int id); -int tegra_powergate_remove_clamping(int id); - -/* Must be called with clk disabled, and returns with clk enabled */ -int tegra_powergate_sequence_power_up(int id, struct clk *clk); - -#endif /* _MACH_TEGRA_POWERGATE_H_ */ diff --git a/arch/arm/mach-tegra/pcie.c b/arch/arm/mach-tegra/pcie.c index b60165f1ca02..46144a19a7e7 100644 --- a/arch/arm/mach-tegra/pcie.c +++ b/arch/arm/mach-tegra/pcie.c @@ -34,12 +34,11 @@ #include #include #include +#include #include #include -#include - #include "board.h" #include "iomap.h" diff --git a/arch/arm/mach-tegra/powergate.c b/arch/arm/mach-tegra/powergate.c index c6bc8f85759c..585d2974a3c1 100644 --- a/arch/arm/mach-tegra/powergate.c +++ b/arch/arm/mach-tegra/powergate.c @@ -27,8 +27,7 @@ #include #include #include - -#include +#include #include "fuse.h" #include "iomap.h" diff --git a/drivers/clk/tegra/clk-tegra30.c b/drivers/clk/tegra/clk-tegra30.c index 32c61cb6d0bb..84584e529a7d 100644 --- a/drivers/clk/tegra/clk-tegra30.c +++ b/drivers/clk/tegra/clk-tegra30.c @@ -22,8 +22,7 @@ #include #include #include - -#include +#include #include "clk.h" diff --git a/include/linux/tegra-powergate.h b/include/linux/tegra-powergate.h new file mode 100644 index 000000000000..55c29a8d5015 --- /dev/null +++ b/include/linux/tegra-powergate.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2010 Google, Inc + * + * Author: + * Colin Cross + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + */ + +#ifndef _MACH_TEGRA_POWERGATE_H_ +#define _MACH_TEGRA_POWERGATE_H_ + +struct clk; + +#define TEGRA_POWERGATE_CPU 0 +#define TEGRA_POWERGATE_3D 1 +#define TEGRA_POWERGATE_VENC 2 +#define TEGRA_POWERGATE_PCIE 3 +#define TEGRA_POWERGATE_VDEC 4 +#define TEGRA_POWERGATE_L2 5 +#define TEGRA_POWERGATE_MPE 6 +#define TEGRA_POWERGATE_HEG 7 +#define TEGRA_POWERGATE_SATA 8 +#define TEGRA_POWERGATE_CPU1 9 +#define TEGRA_POWERGATE_CPU2 10 +#define TEGRA_POWERGATE_CPU3 11 +#define TEGRA_POWERGATE_CELP 12 +#define TEGRA_POWERGATE_3D1 13 + +#define TEGRA_POWERGATE_CPU0 TEGRA_POWERGATE_CPU +#define TEGRA_POWERGATE_3D0 TEGRA_POWERGATE_3D + +int tegra_powergate_is_powered(int id); +int tegra_powergate_power_on(int id); +int tegra_powergate_power_off(int id); +int tegra_powergate_remove_clamping(int id); + +/* Must be called with clk disabled, and returns with clk enabled */ +int tegra_powergate_sequence_power_up(int id, struct clk *clk); + +#endif /* _MACH_TEGRA_POWERGATE_H_ */ -- cgit From 14b57a10553b5b768f77b247e6dd285c65816064 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 29 Mar 2013 14:46:51 +0100 Subject: openvswitch: Use ETH_ALEN to define ethernet addresses Signed-off-by: Thomas Graf Signed-off-by: Jesse Gross --- include/linux/openvswitch.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 67d6c7b03581..8b9d7217eddc 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -20,6 +20,7 @@ #define _LINUX_OPENVSWITCH_H 1 #include +#include /** * struct ovs_header - header for OVS Generic Netlink messages. @@ -269,8 +270,8 @@ enum ovs_frag_type { #define OVS_FRAG_TYPE_MAX (__OVS_FRAG_TYPE_MAX - 1) struct ovs_key_ethernet { - __u8 eth_src[6]; - __u8 eth_dst[6]; + __u8 eth_src[ETH_ALEN]; + __u8 eth_dst[ETH_ALEN]; }; struct ovs_key_ipv4 { @@ -316,14 +317,14 @@ struct ovs_key_arp { __be32 arp_sip; __be32 arp_tip; __be16 arp_op; - __u8 arp_sha[6]; - __u8 arp_tha[6]; + __u8 arp_sha[ETH_ALEN]; + __u8 arp_tha[ETH_ALEN]; }; struct ovs_key_nd { __u32 nd_target[4]; - __u8 nd_sll[6]; - __u8 nd_tll[6]; + __u8 nd_sll[ETH_ALEN]; + __u8 nd_tll[ETH_ALEN]; }; /** -- cgit From 22e3880a76bb9a0c4fa5c8fefdc8697a36a4dae1 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 29 Mar 2013 14:46:52 +0100 Subject: openvswitch: Expose to userspace It contains the public netlink interface bits required by userspace to make use of the interface. Signed-off-by: Thomas Graf Signed-off-by: Jesse Gross --- include/linux/openvswitch.h | 433 +------------------------------------ include/uapi/linux/Kbuild | 1 + include/uapi/linux/openvswitch.h | 456 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 458 insertions(+), 432 deletions(-) create mode 100644 include/uapi/linux/openvswitch.h (limited to 'include/linux') diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index 8b9d7217eddc..e6b240b6196c 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -19,437 +19,6 @@ #ifndef _LINUX_OPENVSWITCH_H #define _LINUX_OPENVSWITCH_H 1 -#include -#include - -/** - * struct ovs_header - header for OVS Generic Netlink messages. - * @dp_ifindex: ifindex of local port for datapath (0 to make a request not - * specific to a datapath). - * - * Attributes following the header are specific to a particular OVS Generic - * Netlink family, but all of the OVS families use this header. - */ - -struct ovs_header { - int dp_ifindex; -}; - -/* Datapaths. 
*/ - -#define OVS_DATAPATH_FAMILY "ovs_datapath" -#define OVS_DATAPATH_MCGROUP "ovs_datapath" -#define OVS_DATAPATH_VERSION 0x1 - -enum ovs_datapath_cmd { - OVS_DP_CMD_UNSPEC, - OVS_DP_CMD_NEW, - OVS_DP_CMD_DEL, - OVS_DP_CMD_GET, - OVS_DP_CMD_SET -}; - -/** - * enum ovs_datapath_attr - attributes for %OVS_DP_* commands. - * @OVS_DP_ATTR_NAME: Name of the network device that serves as the "local - * port". This is the name of the network device whose dp_ifindex is given in - * the &struct ovs_header. Always present in notifications. Required in - * %OVS_DP_NEW requests. May be used as an alternative to specifying - * dp_ifindex in other requests (with a dp_ifindex of 0). - * @OVS_DP_ATTR_UPCALL_PID: The Netlink socket in userspace that is initially - * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on - * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should - * not be sent. - * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the - * datapath. Always present in notifications. - * - * These attributes follow the &struct ovs_header within the Generic Netlink - * payload for %OVS_DP_* commands. - */ -enum ovs_datapath_attr { - OVS_DP_ATTR_UNSPEC, - OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */ - OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */ - OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */ - __OVS_DP_ATTR_MAX -}; - -#define OVS_DP_ATTR_MAX (__OVS_DP_ATTR_MAX - 1) - -struct ovs_dp_stats { - __u64 n_hit; /* Number of flow table matches. */ - __u64 n_missed; /* Number of flow table misses. */ - __u64 n_lost; /* Number of misses not sent to userspace. */ - __u64 n_flows; /* Number of flows present */ -}; - -struct ovs_vport_stats { - __u64 rx_packets; /* total packets received */ - __u64 tx_packets; /* total packets transmitted */ - __u64 rx_bytes; /* total bytes received */ - __u64 tx_bytes; /* total bytes transmitted */ - __u64 rx_errors; /* bad packets received */ - __u64 tx_errors; /* packet transmit problems */ - __u64 rx_dropped; /* no space in linux buffers */ - __u64 tx_dropped; /* no space available in linux */ -}; - -/* Fixed logical ports. */ -#define OVSP_LOCAL ((__u32)0) - -/* Packet transfer. */ - -#define OVS_PACKET_FAMILY "ovs_packet" -#define OVS_PACKET_VERSION 0x1 - -enum ovs_packet_cmd { - OVS_PACKET_CMD_UNSPEC, - - /* Kernel-to-user notifications. */ - OVS_PACKET_CMD_MISS, /* Flow table miss. */ - OVS_PACKET_CMD_ACTION, /* OVS_ACTION_ATTR_USERSPACE action. */ - - /* Userspace commands. */ - OVS_PACKET_CMD_EXECUTE /* Apply actions to a packet. */ -}; - -/** - * enum ovs_packet_attr - attributes for %OVS_PACKET_* commands. - * @OVS_PACKET_ATTR_PACKET: Present for all notifications. Contains the entire - * packet as received, from the start of the Ethernet header onward. For - * %OVS_PACKET_CMD_ACTION, %OVS_PACKET_ATTR_PACKET reflects changes made by - * actions preceding %OVS_ACTION_ATTR_USERSPACE, but %OVS_PACKET_ATTR_KEY is - * the flow key extracted from the packet as originally received. - * @OVS_PACKET_ATTR_KEY: Present for all notifications. Contains the flow key - * extracted from the packet as nested %OVS_KEY_ATTR_* attributes. This allows - * userspace to adapt its flow setup strategy by comparing its notion of the - * flow key against the kernel's. - * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet. Used - * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes. 
- * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION - * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an - * %OVS_USERSPACE_ATTR_USERDATA attribute, with the same length and content - * specified there. - * - * These attributes follow the &struct ovs_header within the Generic Netlink - * payload for %OVS_PACKET_* commands. - */ -enum ovs_packet_attr { - OVS_PACKET_ATTR_UNSPEC, - OVS_PACKET_ATTR_PACKET, /* Packet data. */ - OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */ - OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ - OVS_PACKET_ATTR_USERDATA, /* OVS_ACTION_ATTR_USERSPACE arg. */ - __OVS_PACKET_ATTR_MAX -}; - -#define OVS_PACKET_ATTR_MAX (__OVS_PACKET_ATTR_MAX - 1) - -/* Virtual ports. */ - -#define OVS_VPORT_FAMILY "ovs_vport" -#define OVS_VPORT_MCGROUP "ovs_vport" -#define OVS_VPORT_VERSION 0x1 - -enum ovs_vport_cmd { - OVS_VPORT_CMD_UNSPEC, - OVS_VPORT_CMD_NEW, - OVS_VPORT_CMD_DEL, - OVS_VPORT_CMD_GET, - OVS_VPORT_CMD_SET -}; - -enum ovs_vport_type { - OVS_VPORT_TYPE_UNSPEC, - OVS_VPORT_TYPE_NETDEV, /* network device */ - OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ - __OVS_VPORT_TYPE_MAX -}; - -#define OVS_VPORT_TYPE_MAX (__OVS_VPORT_TYPE_MAX - 1) - -/** - * enum ovs_vport_attr - attributes for %OVS_VPORT_* commands. - * @OVS_VPORT_ATTR_PORT_NO: 32-bit port number within datapath. - * @OVS_VPORT_ATTR_TYPE: 32-bit %OVS_VPORT_TYPE_* constant describing the type - * of vport. - * @OVS_VPORT_ATTR_NAME: Name of vport. For a vport based on a network device - * this is the name of the network device. Maximum length %IFNAMSIZ-1 bytes - * plus a null terminator. - * @OVS_VPORT_ATTR_OPTIONS: Vport-specific configuration information. - * @OVS_VPORT_ATTR_UPCALL_PID: The Netlink socket in userspace that - * OVS_PACKET_CMD_MISS upcalls will be directed to for packets received on - * this port. A value of zero indicates that upcalls should not be sent. - * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for - * packets sent or received through the vport. - * - * These attributes follow the &struct ovs_header within the Generic Netlink - * payload for %OVS_VPORT_* commands. - * - * For %OVS_VPORT_CMD_NEW requests, the %OVS_VPORT_ATTR_TYPE and - * %OVS_VPORT_ATTR_NAME attributes are required. %OVS_VPORT_ATTR_PORT_NO is - * optional; if not specified a free port number is automatically selected. - * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type - * of vport. - * and other attributes are ignored. - * - * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to - * look up the vport to operate on; otherwise dp_idx from the &struct - * ovs_header plus %OVS_VPORT_ATTR_PORT_NO determine the vport. - */ -enum ovs_vport_attr { - OVS_VPORT_ATTR_UNSPEC, - OVS_VPORT_ATTR_PORT_NO, /* u32 port number within datapath */ - OVS_VPORT_ATTR_TYPE, /* u32 OVS_VPORT_TYPE_* constant. */ - OVS_VPORT_ATTR_NAME, /* string name, up to IFNAMSIZ bytes long */ - OVS_VPORT_ATTR_OPTIONS, /* nested attributes, varies by vport type */ - OVS_VPORT_ATTR_UPCALL_PID, /* u32 Netlink PID to receive upcalls */ - OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ - __OVS_VPORT_ATTR_MAX -}; - -#define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) - -/* Flows. 
*/ - -#define OVS_FLOW_FAMILY "ovs_flow" -#define OVS_FLOW_MCGROUP "ovs_flow" -#define OVS_FLOW_VERSION 0x1 - -enum ovs_flow_cmd { - OVS_FLOW_CMD_UNSPEC, - OVS_FLOW_CMD_NEW, - OVS_FLOW_CMD_DEL, - OVS_FLOW_CMD_GET, - OVS_FLOW_CMD_SET -}; - -struct ovs_flow_stats { - __u64 n_packets; /* Number of matched packets. */ - __u64 n_bytes; /* Number of matched bytes. */ -}; - -enum ovs_key_attr { - OVS_KEY_ATTR_UNSPEC, - OVS_KEY_ATTR_ENCAP, /* Nested set of encapsulated attributes. */ - OVS_KEY_ATTR_PRIORITY, /* u32 skb->priority */ - OVS_KEY_ATTR_IN_PORT, /* u32 OVS dp port number */ - OVS_KEY_ATTR_ETHERNET, /* struct ovs_key_ethernet */ - OVS_KEY_ATTR_VLAN, /* be16 VLAN TCI */ - OVS_KEY_ATTR_ETHERTYPE, /* be16 Ethernet type */ - OVS_KEY_ATTR_IPV4, /* struct ovs_key_ipv4 */ - OVS_KEY_ATTR_IPV6, /* struct ovs_key_ipv6 */ - OVS_KEY_ATTR_TCP, /* struct ovs_key_tcp */ - OVS_KEY_ATTR_UDP, /* struct ovs_key_udp */ - OVS_KEY_ATTR_ICMP, /* struct ovs_key_icmp */ - OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */ - OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ - OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ - OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ - __OVS_KEY_ATTR_MAX -}; - -#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1) - -/** - * enum ovs_frag_type - IPv4 and IPv6 fragment type - * @OVS_FRAG_TYPE_NONE: Packet is not a fragment. - * @OVS_FRAG_TYPE_FIRST: Packet is a fragment with offset 0. - * @OVS_FRAG_TYPE_LATER: Packet is a fragment with nonzero offset. - * - * Used as the @ipv4_frag in &struct ovs_key_ipv4 and as @ipv6_frag &struct - * ovs_key_ipv6. - */ -enum ovs_frag_type { - OVS_FRAG_TYPE_NONE, - OVS_FRAG_TYPE_FIRST, - OVS_FRAG_TYPE_LATER, - __OVS_FRAG_TYPE_MAX -}; - -#define OVS_FRAG_TYPE_MAX (__OVS_FRAG_TYPE_MAX - 1) - -struct ovs_key_ethernet { - __u8 eth_src[ETH_ALEN]; - __u8 eth_dst[ETH_ALEN]; -}; - -struct ovs_key_ipv4 { - __be32 ipv4_src; - __be32 ipv4_dst; - __u8 ipv4_proto; - __u8 ipv4_tos; - __u8 ipv4_ttl; - __u8 ipv4_frag; /* One of OVS_FRAG_TYPE_*. */ -}; - -struct ovs_key_ipv6 { - __be32 ipv6_src[4]; - __be32 ipv6_dst[4]; - __be32 ipv6_label; /* 20-bits in least-significant bits. */ - __u8 ipv6_proto; - __u8 ipv6_tclass; - __u8 ipv6_hlimit; - __u8 ipv6_frag; /* One of OVS_FRAG_TYPE_*. */ -}; - -struct ovs_key_tcp { - __be16 tcp_src; - __be16 tcp_dst; -}; - -struct ovs_key_udp { - __be16 udp_src; - __be16 udp_dst; -}; - -struct ovs_key_icmp { - __u8 icmp_type; - __u8 icmp_code; -}; - -struct ovs_key_icmpv6 { - __u8 icmpv6_type; - __u8 icmpv6_code; -}; - -struct ovs_key_arp { - __be32 arp_sip; - __be32 arp_tip; - __be16 arp_op; - __u8 arp_sha[ETH_ALEN]; - __u8 arp_tha[ETH_ALEN]; -}; - -struct ovs_key_nd { - __u32 nd_target[4]; - __u8 nd_sll[ETH_ALEN]; - __u8 nd_tll[ETH_ALEN]; -}; - -/** - * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. - * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow - * key. Always present in notifications. Required for all requests (except - * dumps). - * @OVS_FLOW_ATTR_ACTIONS: Nested %OVS_ACTION_ATTR_* attributes specifying - * the actions to take for packets that match the key. Always present in - * notifications. Required for %OVS_FLOW_CMD_NEW requests, optional for - * %OVS_FLOW_CMD_SET requests. - * @OVS_FLOW_ATTR_STATS: &struct ovs_flow_stats giving statistics for this - * flow. Present in notifications if the stats would be nonzero. Ignored in - * requests. - * @OVS_FLOW_ATTR_TCP_FLAGS: An 8-bit value giving the OR'd value of all of the - * TCP flags seen on packets in this flow. 
Only present in notifications for - * TCP flows, and only if it would be nonzero. Ignored in requests. - * @OVS_FLOW_ATTR_USED: A 64-bit integer giving the time, in milliseconds on - * the system monotonic clock, at which a packet was last processed for this - * flow. Only present in notifications if a packet has been processed for this - * flow. Ignored in requests. - * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the - * last-used time, accumulated TCP flags, and statistics for this flow. - * Otherwise ignored in requests. Never present in notifications. - * - * These attributes follow the &struct ovs_header within the Generic Netlink - * payload for %OVS_FLOW_* commands. - */ -enum ovs_flow_attr { - OVS_FLOW_ATTR_UNSPEC, - OVS_FLOW_ATTR_KEY, /* Sequence of OVS_KEY_ATTR_* attributes. */ - OVS_FLOW_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ - OVS_FLOW_ATTR_STATS, /* struct ovs_flow_stats. */ - OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */ - OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ - OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ - __OVS_FLOW_ATTR_MAX -}; - -#define OVS_FLOW_ATTR_MAX (__OVS_FLOW_ATTR_MAX - 1) - -/** - * enum ovs_sample_attr - Attributes for %OVS_ACTION_ATTR_SAMPLE action. - * @OVS_SAMPLE_ATTR_PROBABILITY: 32-bit fraction of packets to sample with - * @OVS_ACTION_ATTR_SAMPLE. A value of 0 samples no packets, a value of - * %UINT32_MAX samples all packets and intermediate values sample intermediate - * fractions of packets. - * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event. - * Actions are passed as nested attributes. - * - * Executes the specified actions with the given probability on a per-packet - * basis. - */ -enum ovs_sample_attr { - OVS_SAMPLE_ATTR_UNSPEC, - OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */ - OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ - __OVS_SAMPLE_ATTR_MAX, -}; - -#define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1) - -/** - * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action. - * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION - * message should be sent. Required. - * @OVS_USERSPACE_ATTR_USERDATA: If present, its variable-length argument is - * copied to the %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA. - */ -enum ovs_userspace_attr { - OVS_USERSPACE_ATTR_UNSPEC, - OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */ - OVS_USERSPACE_ATTR_USERDATA, /* Optional user-specified cookie. */ - __OVS_USERSPACE_ATTR_MAX -}; - -#define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) - -/** - * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. - * @vlan_tpid: Tag protocol identifier (TPID) to push. - * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set - * (but it will not be set in the 802.1Q header that is pushed). - * - * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID - * values are those that the kernel module also parses as 802.1Q headers, to - * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN - * from having surprising results. - */ -struct ovs_action_push_vlan { - __be16 vlan_tpid; /* 802.1Q TPID. */ - __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ -}; - -/** - * enum ovs_action_attr - Action types. - * - * @OVS_ACTION_ATTR_OUTPUT: Output packet to port. 
- * @OVS_ACTION_ATTR_USERSPACE: Send packet to userspace according to nested - * %OVS_USERSPACE_ATTR_* attributes. - * @OVS_ACTION_ATTR_SET: Replaces the contents of an existing header. The - * single nested %OVS_KEY_ATTR_* attribute specifies a header to modify and its - * value. - * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the - * packet. - * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. - * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in - * the nested %OVS_SAMPLE_ATTR_* attributes. - * - * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all - * fields within a header are modifiable, e.g. the IPv4 protocol and fragment - * type may not be changed. - */ - -enum ovs_action_attr { - OVS_ACTION_ATTR_UNSPEC, - OVS_ACTION_ATTR_OUTPUT, /* u32 port number. */ - OVS_ACTION_ATTR_USERSPACE, /* Nested OVS_USERSPACE_ATTR_*. */ - OVS_ACTION_ATTR_SET, /* One nested OVS_KEY_ATTR_*. */ - OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ - OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ - OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ - __OVS_ACTION_ATTR_MAX -}; - -#define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1) +#include #endif /* _LINUX_OPENVSWITCH_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 5c8a1d25e21c..d8fbc6aeac86 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -285,6 +285,7 @@ header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h header-y += oom.h +header-y += openvswitch.h header-y += packet_diag.h header-y += param.h header-y += parport.h diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h new file mode 100644 index 000000000000..405918dd7b3f --- /dev/null +++ b/include/uapi/linux/openvswitch.h @@ -0,0 +1,456 @@ + +/* + * Copyright (c) 2007-2011 Nicira Networks. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA + */ + +#ifndef _UAPI__LINUX_OPENVSWITCH_H +#define _UAPI__LINUX_OPENVSWITCH_H 1 + +#include +#include + +/** + * struct ovs_header - header for OVS Generic Netlink messages. + * @dp_ifindex: ifindex of local port for datapath (0 to make a request not + * specific to a datapath). + * + * Attributes following the header are specific to a particular OVS Generic + * Netlink family, but all of the OVS families use this header. + */ + +struct ovs_header { + int dp_ifindex; +}; + +/* Datapaths. */ + +#define OVS_DATAPATH_FAMILY "ovs_datapath" +#define OVS_DATAPATH_MCGROUP "ovs_datapath" +#define OVS_DATAPATH_VERSION 0x1 + +enum ovs_datapath_cmd { + OVS_DP_CMD_UNSPEC, + OVS_DP_CMD_NEW, + OVS_DP_CMD_DEL, + OVS_DP_CMD_GET, + OVS_DP_CMD_SET +}; + +/** + * enum ovs_datapath_attr - attributes for %OVS_DP_* commands. + * @OVS_DP_ATTR_NAME: Name of the network device that serves as the "local + * port". 
This is the name of the network device whose dp_ifindex is given in + * the &struct ovs_header. Always present in notifications. Required in + * %OVS_DP_NEW requests. May be used as an alternative to specifying + * dp_ifindex in other requests (with a dp_ifindex of 0). + * @OVS_DP_ATTR_UPCALL_PID: The Netlink socket in userspace that is initially + * set on the datapath port (for OVS_ACTION_ATTR_MISS). Only valid on + * %OVS_DP_CMD_NEW requests. A value of zero indicates that upcalls should + * not be sent. + * @OVS_DP_ATTR_STATS: Statistics about packets that have passed through the + * datapath. Always present in notifications. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_DP_* commands. + */ +enum ovs_datapath_attr { + OVS_DP_ATTR_UNSPEC, + OVS_DP_ATTR_NAME, /* name of dp_ifindex netdev */ + OVS_DP_ATTR_UPCALL_PID, /* Netlink PID to receive upcalls */ + OVS_DP_ATTR_STATS, /* struct ovs_dp_stats */ + __OVS_DP_ATTR_MAX +}; + +#define OVS_DP_ATTR_MAX (__OVS_DP_ATTR_MAX - 1) + +struct ovs_dp_stats { + __u64 n_hit; /* Number of flow table matches. */ + __u64 n_missed; /* Number of flow table misses. */ + __u64 n_lost; /* Number of misses not sent to userspace. */ + __u64 n_flows; /* Number of flows present */ +}; + +struct ovs_vport_stats { + __u64 rx_packets; /* total packets received */ + __u64 tx_packets; /* total packets transmitted */ + __u64 rx_bytes; /* total bytes received */ + __u64 tx_bytes; /* total bytes transmitted */ + __u64 rx_errors; /* bad packets received */ + __u64 tx_errors; /* packet transmit problems */ + __u64 rx_dropped; /* no space in linux buffers */ + __u64 tx_dropped; /* no space available in linux */ +}; + +/* Fixed logical ports. */ +#define OVSP_LOCAL ((__u32)0) + +/* Packet transfer. */ + +#define OVS_PACKET_FAMILY "ovs_packet" +#define OVS_PACKET_VERSION 0x1 + +enum ovs_packet_cmd { + OVS_PACKET_CMD_UNSPEC, + + /* Kernel-to-user notifications. */ + OVS_PACKET_CMD_MISS, /* Flow table miss. */ + OVS_PACKET_CMD_ACTION, /* OVS_ACTION_ATTR_USERSPACE action. */ + + /* Userspace commands. */ + OVS_PACKET_CMD_EXECUTE /* Apply actions to a packet. */ +}; + +/** + * enum ovs_packet_attr - attributes for %OVS_PACKET_* commands. + * @OVS_PACKET_ATTR_PACKET: Present for all notifications. Contains the entire + * packet as received, from the start of the Ethernet header onward. For + * %OVS_PACKET_CMD_ACTION, %OVS_PACKET_ATTR_PACKET reflects changes made by + * actions preceding %OVS_ACTION_ATTR_USERSPACE, but %OVS_PACKET_ATTR_KEY is + * the flow key extracted from the packet as originally received. + * @OVS_PACKET_ATTR_KEY: Present for all notifications. Contains the flow key + * extracted from the packet as nested %OVS_KEY_ATTR_* attributes. This allows + * userspace to adapt its flow setup strategy by comparing its notion of the + * flow key against the kernel's. + * @OVS_PACKET_ATTR_ACTIONS: Contains actions for the packet. Used + * for %OVS_PACKET_CMD_EXECUTE. It has nested %OVS_ACTION_ATTR_* attributes. + * @OVS_PACKET_ATTR_USERDATA: Present for an %OVS_PACKET_CMD_ACTION + * notification if the %OVS_ACTION_ATTR_USERSPACE action specified an + * %OVS_USERSPACE_ATTR_USERDATA attribute, with the same length and content + * specified there. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_PACKET_* commands. + */ +enum ovs_packet_attr { + OVS_PACKET_ATTR_UNSPEC, + OVS_PACKET_ATTR_PACKET, /* Packet data. 
*/ + OVS_PACKET_ATTR_KEY, /* Nested OVS_KEY_ATTR_* attributes. */ + OVS_PACKET_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + OVS_PACKET_ATTR_USERDATA, /* OVS_ACTION_ATTR_USERSPACE arg. */ + __OVS_PACKET_ATTR_MAX +}; + +#define OVS_PACKET_ATTR_MAX (__OVS_PACKET_ATTR_MAX - 1) + +/* Virtual ports. */ + +#define OVS_VPORT_FAMILY "ovs_vport" +#define OVS_VPORT_MCGROUP "ovs_vport" +#define OVS_VPORT_VERSION 0x1 + +enum ovs_vport_cmd { + OVS_VPORT_CMD_UNSPEC, + OVS_VPORT_CMD_NEW, + OVS_VPORT_CMD_DEL, + OVS_VPORT_CMD_GET, + OVS_VPORT_CMD_SET +}; + +enum ovs_vport_type { + OVS_VPORT_TYPE_UNSPEC, + OVS_VPORT_TYPE_NETDEV, /* network device */ + OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */ + __OVS_VPORT_TYPE_MAX +}; + +#define OVS_VPORT_TYPE_MAX (__OVS_VPORT_TYPE_MAX - 1) + +/** + * enum ovs_vport_attr - attributes for %OVS_VPORT_* commands. + * @OVS_VPORT_ATTR_PORT_NO: 32-bit port number within datapath. + * @OVS_VPORT_ATTR_TYPE: 32-bit %OVS_VPORT_TYPE_* constant describing the type + * of vport. + * @OVS_VPORT_ATTR_NAME: Name of vport. For a vport based on a network device + * this is the name of the network device. Maximum length %IFNAMSIZ-1 bytes + * plus a null terminator. + * @OVS_VPORT_ATTR_OPTIONS: Vport-specific configuration information. + * @OVS_VPORT_ATTR_UPCALL_PID: The Netlink socket in userspace that + * OVS_PACKET_CMD_MISS upcalls will be directed to for packets received on + * this port. A value of zero indicates that upcalls should not be sent. + * @OVS_VPORT_ATTR_STATS: A &struct ovs_vport_stats giving statistics for + * packets sent or received through the vport. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_VPORT_* commands. + * + * For %OVS_VPORT_CMD_NEW requests, the %OVS_VPORT_ATTR_TYPE and + * %OVS_VPORT_ATTR_NAME attributes are required. %OVS_VPORT_ATTR_PORT_NO is + * optional; if not specified a free port number is automatically selected. + * Whether %OVS_VPORT_ATTR_OPTIONS is required or optional depends on the type + * of vport. + * and other attributes are ignored. + * + * For other requests, if %OVS_VPORT_ATTR_NAME is specified then it is used to + * look up the vport to operate on; otherwise dp_idx from the &struct + * ovs_header plus %OVS_VPORT_ATTR_PORT_NO determine the vport. + */ +enum ovs_vport_attr { + OVS_VPORT_ATTR_UNSPEC, + OVS_VPORT_ATTR_PORT_NO, /* u32 port number within datapath */ + OVS_VPORT_ATTR_TYPE, /* u32 OVS_VPORT_TYPE_* constant. */ + OVS_VPORT_ATTR_NAME, /* string name, up to IFNAMSIZ bytes long */ + OVS_VPORT_ATTR_OPTIONS, /* nested attributes, varies by vport type */ + OVS_VPORT_ATTR_UPCALL_PID, /* u32 Netlink PID to receive upcalls */ + OVS_VPORT_ATTR_STATS, /* struct ovs_vport_stats */ + __OVS_VPORT_ATTR_MAX +}; + +#define OVS_VPORT_ATTR_MAX (__OVS_VPORT_ATTR_MAX - 1) + +/* Flows. */ + +#define OVS_FLOW_FAMILY "ovs_flow" +#define OVS_FLOW_MCGROUP "ovs_flow" +#define OVS_FLOW_VERSION 0x1 + +enum ovs_flow_cmd { + OVS_FLOW_CMD_UNSPEC, + OVS_FLOW_CMD_NEW, + OVS_FLOW_CMD_DEL, + OVS_FLOW_CMD_GET, + OVS_FLOW_CMD_SET +}; + +struct ovs_flow_stats { + __u64 n_packets; /* Number of matched packets. */ + __u64 n_bytes; /* Number of matched bytes. */ +}; + +enum ovs_key_attr { + OVS_KEY_ATTR_UNSPEC, + OVS_KEY_ATTR_ENCAP, /* Nested set of encapsulated attributes. 
*/ + OVS_KEY_ATTR_PRIORITY, /* u32 skb->priority */ + OVS_KEY_ATTR_IN_PORT, /* u32 OVS dp port number */ + OVS_KEY_ATTR_ETHERNET, /* struct ovs_key_ethernet */ + OVS_KEY_ATTR_VLAN, /* be16 VLAN TCI */ + OVS_KEY_ATTR_ETHERTYPE, /* be16 Ethernet type */ + OVS_KEY_ATTR_IPV4, /* struct ovs_key_ipv4 */ + OVS_KEY_ATTR_IPV6, /* struct ovs_key_ipv6 */ + OVS_KEY_ATTR_TCP, /* struct ovs_key_tcp */ + OVS_KEY_ATTR_UDP, /* struct ovs_key_udp */ + OVS_KEY_ATTR_ICMP, /* struct ovs_key_icmp */ + OVS_KEY_ATTR_ICMPV6, /* struct ovs_key_icmpv6 */ + OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */ + OVS_KEY_ATTR_ND, /* struct ovs_key_nd */ + OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */ + __OVS_KEY_ATTR_MAX +}; + +#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1) + +/** + * enum ovs_frag_type - IPv4 and IPv6 fragment type + * @OVS_FRAG_TYPE_NONE: Packet is not a fragment. + * @OVS_FRAG_TYPE_FIRST: Packet is a fragment with offset 0. + * @OVS_FRAG_TYPE_LATER: Packet is a fragment with nonzero offset. + * + * Used as the @ipv4_frag in &struct ovs_key_ipv4 and as @ipv6_frag &struct + * ovs_key_ipv6. + */ +enum ovs_frag_type { + OVS_FRAG_TYPE_NONE, + OVS_FRAG_TYPE_FIRST, + OVS_FRAG_TYPE_LATER, + __OVS_FRAG_TYPE_MAX +}; + +#define OVS_FRAG_TYPE_MAX (__OVS_FRAG_TYPE_MAX - 1) + +struct ovs_key_ethernet { + __u8 eth_src[ETH_ALEN]; + __u8 eth_dst[ETH_ALEN]; +}; + +struct ovs_key_ipv4 { + __be32 ipv4_src; + __be32 ipv4_dst; + __u8 ipv4_proto; + __u8 ipv4_tos; + __u8 ipv4_ttl; + __u8 ipv4_frag; /* One of OVS_FRAG_TYPE_*. */ +}; + +struct ovs_key_ipv6 { + __be32 ipv6_src[4]; + __be32 ipv6_dst[4]; + __be32 ipv6_label; /* 20-bits in least-significant bits. */ + __u8 ipv6_proto; + __u8 ipv6_tclass; + __u8 ipv6_hlimit; + __u8 ipv6_frag; /* One of OVS_FRAG_TYPE_*. */ +}; + +struct ovs_key_tcp { + __be16 tcp_src; + __be16 tcp_dst; +}; + +struct ovs_key_udp { + __be16 udp_src; + __be16 udp_dst; +}; + +struct ovs_key_icmp { + __u8 icmp_type; + __u8 icmp_code; +}; + +struct ovs_key_icmpv6 { + __u8 icmpv6_type; + __u8 icmpv6_code; +}; + +struct ovs_key_arp { + __be32 arp_sip; + __be32 arp_tip; + __be16 arp_op; + __u8 arp_sha[ETH_ALEN]; + __u8 arp_tha[ETH_ALEN]; +}; + +struct ovs_key_nd { + __u32 nd_target[4]; + __u8 nd_sll[ETH_ALEN]; + __u8 nd_tll[ETH_ALEN]; +}; + +/** + * enum ovs_flow_attr - attributes for %OVS_FLOW_* commands. + * @OVS_FLOW_ATTR_KEY: Nested %OVS_KEY_ATTR_* attributes specifying the flow + * key. Always present in notifications. Required for all requests (except + * dumps). + * @OVS_FLOW_ATTR_ACTIONS: Nested %OVS_ACTION_ATTR_* attributes specifying + * the actions to take for packets that match the key. Always present in + * notifications. Required for %OVS_FLOW_CMD_NEW requests, optional for + * %OVS_FLOW_CMD_SET requests. + * @OVS_FLOW_ATTR_STATS: &struct ovs_flow_stats giving statistics for this + * flow. Present in notifications if the stats would be nonzero. Ignored in + * requests. + * @OVS_FLOW_ATTR_TCP_FLAGS: An 8-bit value giving the OR'd value of all of the + * TCP flags seen on packets in this flow. Only present in notifications for + * TCP flows, and only if it would be nonzero. Ignored in requests. + * @OVS_FLOW_ATTR_USED: A 64-bit integer giving the time, in milliseconds on + * the system monotonic clock, at which a packet was last processed for this + * flow. Only present in notifications if a packet has been processed for this + * flow. Ignored in requests. 
+ * @OVS_FLOW_ATTR_CLEAR: If present in a %OVS_FLOW_CMD_SET request, clears the + * last-used time, accumulated TCP flags, and statistics for this flow. + * Otherwise ignored in requests. Never present in notifications. + * + * These attributes follow the &struct ovs_header within the Generic Netlink + * payload for %OVS_FLOW_* commands. + */ +enum ovs_flow_attr { + OVS_FLOW_ATTR_UNSPEC, + OVS_FLOW_ATTR_KEY, /* Sequence of OVS_KEY_ATTR_* attributes. */ + OVS_FLOW_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + OVS_FLOW_ATTR_STATS, /* struct ovs_flow_stats. */ + OVS_FLOW_ATTR_TCP_FLAGS, /* 8-bit OR'd TCP flags. */ + OVS_FLOW_ATTR_USED, /* u64 msecs last used in monotonic time. */ + OVS_FLOW_ATTR_CLEAR, /* Flag to clear stats, tcp_flags, used. */ + __OVS_FLOW_ATTR_MAX +}; + +#define OVS_FLOW_ATTR_MAX (__OVS_FLOW_ATTR_MAX - 1) + +/** + * enum ovs_sample_attr - Attributes for %OVS_ACTION_ATTR_SAMPLE action. + * @OVS_SAMPLE_ATTR_PROBABILITY: 32-bit fraction of packets to sample with + * @OVS_ACTION_ATTR_SAMPLE. A value of 0 samples no packets, a value of + * %UINT32_MAX samples all packets and intermediate values sample intermediate + * fractions of packets. + * @OVS_SAMPLE_ATTR_ACTIONS: Set of actions to execute in sampling event. + * Actions are passed as nested attributes. + * + * Executes the specified actions with the given probability on a per-packet + * basis. + */ +enum ovs_sample_attr { + OVS_SAMPLE_ATTR_UNSPEC, + OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */ + OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ + __OVS_SAMPLE_ATTR_MAX, +}; + +#define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1) + +/** + * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action. + * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION + * message should be sent. Required. + * @OVS_USERSPACE_ATTR_USERDATA: If present, its variable-length argument is + * copied to the %OVS_PACKET_CMD_ACTION message as %OVS_PACKET_ATTR_USERDATA. + */ +enum ovs_userspace_attr { + OVS_USERSPACE_ATTR_UNSPEC, + OVS_USERSPACE_ATTR_PID, /* u32 Netlink PID to receive upcalls. */ + OVS_USERSPACE_ATTR_USERDATA, /* Optional user-specified cookie. */ + __OVS_USERSPACE_ATTR_MAX +}; + +#define OVS_USERSPACE_ATTR_MAX (__OVS_USERSPACE_ATTR_MAX - 1) + +/** + * struct ovs_action_push_vlan - %OVS_ACTION_ATTR_PUSH_VLAN action argument. + * @vlan_tpid: Tag protocol identifier (TPID) to push. + * @vlan_tci: Tag control identifier (TCI) to push. The CFI bit must be set + * (but it will not be set in the 802.1Q header that is pushed). + * + * The @vlan_tpid value is typically %ETH_P_8021Q. The only acceptable TPID + * values are those that the kernel module also parses as 802.1Q headers, to + * prevent %OVS_ACTION_ATTR_PUSH_VLAN followed by %OVS_ACTION_ATTR_POP_VLAN + * from having surprising results. + */ +struct ovs_action_push_vlan { + __be16 vlan_tpid; /* 802.1Q TPID. */ + __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ +}; + +/** + * enum ovs_action_attr - Action types. + * + * @OVS_ACTION_ATTR_OUTPUT: Output packet to port. + * @OVS_ACTION_ATTR_USERSPACE: Send packet to userspace according to nested + * %OVS_USERSPACE_ATTR_* attributes. + * @OVS_ACTION_ATTR_SET: Replaces the contents of an existing header. The + * single nested %OVS_KEY_ATTR_* attribute specifies a header to modify and its + * value. + * @OVS_ACTION_ATTR_PUSH_VLAN: Push a new outermost 802.1Q header onto the + * packet. 
+ * @OVS_ACTION_ATTR_POP_VLAN: Pop the outermost 802.1Q header off the packet. + * @OVS_ACTION_ATTR_SAMPLE: Probabilitically executes actions, as specified in + * the nested %OVS_SAMPLE_ATTR_* attributes. + * + * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all + * fields within a header are modifiable, e.g. the IPv4 protocol and fragment + * type may not be changed. + */ + +enum ovs_action_attr { + OVS_ACTION_ATTR_UNSPEC, + OVS_ACTION_ATTR_OUTPUT, /* u32 port number. */ + OVS_ACTION_ATTR_USERSPACE, /* Nested OVS_USERSPACE_ATTR_*. */ + OVS_ACTION_ATTR_SET, /* One nested OVS_KEY_ATTR_*. */ + OVS_ACTION_ATTR_PUSH_VLAN, /* struct ovs_action_push_vlan. */ + OVS_ACTION_ATTR_POP_VLAN, /* No argument. */ + OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ + __OVS_ACTION_ATTR_MAX +}; + +#define OVS_ACTION_ATTR_MAX (__OVS_ACTION_ATTR_MAX - 1) + +#endif /* _LINUX_OPENVSWITCH_H */ -- cgit From a691ce7fe451363d2f1fa48d30c8f4b87c2475d4 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 28 Mar 2013 15:24:53 +0000 Subject: include/linux: printk is needed in filter.h when CONFIG_BPF_JIT is defined for make V=1 EXTRA_CFLAGS=-W ARCH=arm allmodconfig printk is need when CONFIG_BPF_JIT is defined or it will report pr_err and print_hex_dump are implicit declaration Signed-off-by: Chen Gang Signed-off-by: David S. Miller --- include/linux/filter.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index d7d25083130b..d1248f401a56 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -48,6 +48,9 @@ extern int sk_chk_filter(struct sock_filter *filter, unsigned int flen); extern int sk_get_filter(struct sock *sk, struct sock_filter __user *filter, unsigned len); #ifdef CONFIG_BPF_JIT +#include +#include + extern void bpf_jit_compile(struct sk_filter *fp); extern void bpf_jit_free(struct sk_filter *fp); -- cgit From 4c3d5e7b41dda1b1372bfc2545ef092a1bc5ad33 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 30 Mar 2013 06:31:03 +0000 Subject: net: reorder some fields of net_device As time passed, some fields were added in net_device, and not at sensible offsets. Lets reorder some fields to reduce number of cache lines in RX path. Fields not used in data path should be moved out of this critical cache line. In particular, move broadcast[] to the end of the rx section, as it is less used, and ethernet uses only the beginning of the 32bytes field. Before patch : offsetof(struct net_device,dev_addr)=0x258 offsetof(struct net_device,rx_handler)=0x2b8 offsetof(struct net_device,ingress_queue)=0x2c8 offsetof(struct net_device,broadcast)=0x278 After : offsetof(struct net_device,dev_addr)=0x280 offsetof(struct net_device,rx_handler)=0x298 offsetof(struct net_device,ingress_queue)=0x2a8 offsetof(struct net_device,broadcast)=0x2b0 Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1dbb02c98946..4491414a9218 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1071,6 +1071,8 @@ struct net_device { struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; + struct list_head upper_dev_list; /* List of upper devices */ + /* currently active device features */ netdev_features_t features; @@ -1143,6 +1145,13 @@ struct net_device { spinlock_t addr_list_lock; struct netdev_hw_addr_list uc; /* Unicast mac addresses */ struct netdev_hw_addr_list mc; /* Multicast mac addresses */ + struct netdev_hw_addr_list dev_addrs; /* list of device + * hw addresses + */ +#ifdef CONFIG_SYSFS + struct kset *queues_kset; +#endif + bool uc_promisc; unsigned int promiscuity; unsigned int allmulti; @@ -1175,21 +1184,11 @@ struct net_device { * avoid dirtying this cache line. */ - struct list_head upper_dev_list; /* List of upper devices */ - /* Interface address info used in eth_type_trans() */ unsigned char *dev_addr; /* hw address, (before bcast because most packets are unicast) */ - struct netdev_hw_addr_list dev_addrs; /* list of device - hw addresses */ - - unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ - -#ifdef CONFIG_SYSFS - struct kset *queues_kset; -#endif #ifdef CONFIG_RPS struct netdev_rx_queue *_rx; @@ -1200,18 +1199,14 @@ struct net_device { /* Number of RX queues currently active in device */ unsigned int real_num_rx_queues; -#ifdef CONFIG_RFS_ACCEL - /* CPU reverse-mapping for RX completion interrupts, indexed - * by RX queue number. Assigned by driver. This must only be - * set if the ndo_rx_flow_steer operation is defined. */ - struct cpu_rmap *rx_cpu_rmap; -#endif #endif rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; struct netdev_queue __rcu *ingress_queue; + unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ + /* * Cache lines mostly used on transmit path @@ -1233,6 +1228,12 @@ struct net_device { #ifdef CONFIG_XPS struct xps_dev_maps __rcu *xps_maps; #endif +#ifdef CONFIG_RFS_ACCEL + /* CPU reverse-mapping for RX completion interrupts, indexed + * by RX queue number. Assigned by driver. This must only be + * set if the ndo_rx_flow_steer operation is defined. */ + struct cpu_rmap *rx_cpu_rmap; +#endif /* These may be needed for future network-power-down code. */ -- cgit From b60e6a0eb0273132cbb60a9806abf5f47a4aee1c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Thu, 21 Mar 2013 12:21:31 +0000 Subject: cpuidle : handle clockevent notify from the cpuidle framework When a cpu enters a deep idle state, the local timers are stopped and the time framework falls back to the timer device used as a broadcast timer. The different cpuidle drivers are calling clockevents_notify ENTER/EXIT when the idle state stops the local timer. Add a new flag CPUIDLE_FLAG_TIMER_STOP which can be set by the cpuidle drivers. If the flag is set, the cpuidle core code takes care of the notification on behalf of the driver to avoid pointless code duplication. Signed-off-by: Daniel Lezcano Reviewed-by: Thomas Gleixner Acked-by: Santosh Shilimkar Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/cpuidle.c | 9 +++++++++ include/linux/cpuidle.h | 1 + 2 files changed, 10 insertions(+) (limited to 'include/linux') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index eba69290e074..c50037029184 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -8,6 +8,7 @@ * This code is licenced under the GPL. */ +#include #include #include #include @@ -146,12 +147,20 @@ int cpuidle_idle_call(void) trace_cpu_idle_rcuidle(next_state, dev->cpu); + if (drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + &dev->cpu); + if (cpuidle_state_is_coupled(dev, drv, next_state)) entered_state = cpuidle_enter_state_coupled(dev, drv, next_state); else entered_state = cpuidle_enter_state(dev, drv, next_state); + if (drv->states[next_state].flags & CPUIDLE_FLAG_TIMER_STOP) + clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + &dev->cpu); + trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu); /* give the governor an opportunity to reflect on the outcome */ diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 480c14dc1ddd..a837b332df65 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -57,6 +57,7 @@ struct cpuidle_state { /* Idle State Flags */ #define CPUIDLE_FLAG_TIME_VALID (0x01) /* is residency time measurable? */ #define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ +#define CPUIDLE_FLAG_TIMER_STOP (0x04) /* timer is stopped on this state */ #define CPUIDLE_DRIVER_FLAGS_MASK (0xFFFF0000) -- cgit From 4dbad816febb6cb7340e36af4f5c0dc86e55a2ca Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Mar 2013 10:22:09 +0000 Subject: timer: move enum definition out of ifdef section The next patch will setup automatically the broadcast timer for the different cpuidle driver when one idle state stops its timer. This will be part of the generic code. But some ARM boards, like s3c64xx, uses cpuidle but without the CONFIG_GENERIC_CLOCKEVENTS_BUILD set. Hence the cpuidle framework will be compiled with the code supposed to be generic, that is with clockevents_notify and the different enum. Also the function clockevents_notify is a noop macro, this is fine except the usual code is: int cpu = smp_processor_id(); clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ON, &cpu); and that raises a warning for the variable cpu which is not used. Move the clock_event_nofitiers enum definition out of the CONFIG_GENERIC_CLOCKEVENTS_BUILD section to prevent a compilation error when these are used in the code. Change the clockevents_notify macro to a static inline noop function to prevent a compilation warning. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- include/linux/clockchips.h | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 66346521cb65..f9fd93758333 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -8,6 +8,20 @@ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H +/* Clock event notification values */ +enum clock_event_nofitiers { + CLOCK_EVT_NOTIFY_ADD, + CLOCK_EVT_NOTIFY_BROADCAST_ON, + CLOCK_EVT_NOTIFY_BROADCAST_OFF, + CLOCK_EVT_NOTIFY_BROADCAST_FORCE, + CLOCK_EVT_NOTIFY_BROADCAST_ENTER, + CLOCK_EVT_NOTIFY_BROADCAST_EXIT, + CLOCK_EVT_NOTIFY_SUSPEND, + CLOCK_EVT_NOTIFY_RESUME, + CLOCK_EVT_NOTIFY_CPU_DYING, + CLOCK_EVT_NOTIFY_CPU_DEAD, +}; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD #include @@ -26,20 +40,6 @@ enum clock_event_mode { CLOCK_EVT_MODE_RESUME, }; -/* Clock event notification values */ -enum clock_event_nofitiers { - CLOCK_EVT_NOTIFY_ADD, - CLOCK_EVT_NOTIFY_BROADCAST_ON, - CLOCK_EVT_NOTIFY_BROADCAST_OFF, - CLOCK_EVT_NOTIFY_BROADCAST_FORCE, - CLOCK_EVT_NOTIFY_BROADCAST_ENTER, - CLOCK_EVT_NOTIFY_BROADCAST_EXIT, - CLOCK_EVT_NOTIFY_SUSPEND, - CLOCK_EVT_NOTIFY_RESUME, - CLOCK_EVT_NOTIFY_CPU_DYING, - CLOCK_EVT_NOTIFY_CPU_DEAD, -}; - /* * Clock event features */ @@ -173,7 +173,7 @@ extern int tick_receive_broadcast(void); #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); #else -# define clockevents_notify(reason, arg) do { } while (0) +static inline void clockevents_notify(unsigned long reason, void *arg) {} #endif #else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */ @@ -181,7 +181,7 @@ extern void clockevents_notify(unsigned long reason, void *arg); static inline void clockevents_suspend(void) {} static inline void clockevents_resume(void) {} -#define clockevents_notify(reason, arg) do { } while (0) +static inline void clockevents_notify(unsigned long reason, void *arg) {} #endif -- cgit From a06df062a189a8d5588babb8bf0bb78672497798 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Wed, 27 Mar 2013 10:22:10 +0000 Subject: cpuidle: initialize the broadcast timer framework The commit 89878baa73f0f1c679355006bd8632e5d78f96c2 introduced the CPUIDLE_FLAG_TIMER_STOP flag where we specify a specific idle state stops the local timer. Now use this flag to check at init time if one state will need the broadcast timer and, in this case, setup the broadcast timer framework. That prevents multiple code duplication in the drivers. Signed-off-by: Daniel Lezcano Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/driver.c | 31 +++++++++++++++++++++++++++++-- include/linux/cpuidle.h | 2 ++ 2 files changed, 31 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 422c7b69ba7c..8dfaaae94444 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include #include "cpuidle.h" @@ -19,9 +21,28 @@ DEFINE_SPINLOCK(cpuidle_driver_lock); static void __cpuidle_set_cpu_driver(struct cpuidle_driver *drv, int cpu); static struct cpuidle_driver * __cpuidle_get_cpu_driver(int cpu); -static void __cpuidle_driver_init(struct cpuidle_driver *drv) +static void cpuidle_setup_broadcast_timer(void *arg) { + int cpu = smp_processor_id(); + clockevents_notify((long)(arg), &cpu); +} + +static void __cpuidle_driver_init(struct cpuidle_driver *drv, int cpu) +{ + int i; + drv->refcnt = 0; + + for (i = drv->state_count - 1; i >= 0 ; i--) { + + if (!(drv->states[i].flags & CPUIDLE_FLAG_TIMER_STOP)) + continue; + + drv->bctimer = 1; + on_each_cpu_mask(get_cpu_mask(cpu), cpuidle_setup_broadcast_timer, + (void *)CLOCK_EVT_NOTIFY_BROADCAST_ON, 1); + break; + } } static int __cpuidle_register_driver(struct cpuidle_driver *drv, int cpu) @@ -35,7 +56,7 @@ static int __cpuidle_register_driver(struct cpuidle_driver *drv, int cpu) if (__cpuidle_get_cpu_driver(cpu)) return -EBUSY; - __cpuidle_driver_init(drv); + __cpuidle_driver_init(drv, cpu); __cpuidle_set_cpu_driver(drv, cpu); @@ -49,6 +70,12 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv, int cpu) if (!WARN_ON(drv->refcnt > 0)) __cpuidle_set_cpu_driver(NULL, cpu); + + if (drv->bctimer) { + drv->bctimer = 0; + on_each_cpu_mask(get_cpu_mask(cpu), cpuidle_setup_broadcast_timer, + (void *)CLOCK_EVT_NOTIFY_BROADCAST_OFF, 1); + } } #ifdef CONFIG_CPU_IDLE_MULTIPLE_DRIVERS diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index a837b332df65..fc3e5808b7ff 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -107,6 +107,8 @@ struct cpuidle_driver { /* set to 1 to use the core cpuidle time keeping (for all states). */ unsigned int en_core_tk_irqen:1; + /* used by the cpuidle framework to setup the broadcast timer */ + unsigned int bctimer:1; /* states array must be ordered in decreasing power consumption */ struct cpuidle_state states[CPUIDLE_STATE_MAX]; int state_count; -- cgit From 7bd353a995d9049262661d85811d6109140582a3 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 27 Mar 2013 15:58:57 +0000 Subject: cpufreq: Add per policy governor-init/exit infrastructure Currently, there can't be multiple instances of single governor_type. If we have a multi-package system, where we have multiple instances of struct policy (per package), we can't have multiple instances of same governor. i.e. We can't have multiple instances of ondemand governor for multiple packages. Governors directory in sysfs is created at /sys/devices/system/cpu/cpufreq/ governor-name/. Which again reflects that there can be only one instance of a governor_type in the system. This is a bottleneck for multicluster system, where we want different packages to use same governor type, but with different tunables. This patch is inclined towards providing this infrastructure. Because we are required to allocate governor's resources dynamically now, we must do it at policy creation and end. And so got CPUFREQ_GOV_POLICY_INIT/EXIT. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. 
--- drivers/cpufreq/cpufreq.c | 21 ++++++++++++++++++--- include/linux/cpufreq.h | 9 ++++++--- 2 files changed, 24 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c5996fe7250e..08df7a196116 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1070,6 +1070,8 @@ static int __cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif /* If cpu is last user of policy, free policy */ if (cpus == 1) { + __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT); + lock_policy_rwsem_read(cpu); kobj = &data->kobj; cmp = &data->kobj_unregister; @@ -1651,7 +1653,7 @@ EXPORT_SYMBOL(cpufreq_get_policy); static int __cpufreq_set_policy(struct cpufreq_policy *data, struct cpufreq_policy *policy) { - int ret = 0; + int ret = 0, failed = 1; pr_debug("setting new policy for CPU %u: %u - %u kHz\n", policy->cpu, policy->min, policy->max); @@ -1705,17 +1707,30 @@ static int __cpufreq_set_policy(struct cpufreq_policy *data, pr_debug("governor switch\n"); /* end old governor */ - if (data->governor) + if (data->governor) { __cpufreq_governor(data, CPUFREQ_GOV_STOP); + __cpufreq_governor(data, + CPUFREQ_GOV_POLICY_EXIT); + } /* start new governor */ data->governor = policy->governor; - if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { + if (!__cpufreq_governor(data, CPUFREQ_GOV_POLICY_INIT)) { + if (!__cpufreq_governor(data, CPUFREQ_GOV_START)) + failed = 0; + else + __cpufreq_governor(data, + CPUFREQ_GOV_POLICY_EXIT); + } + + if (failed) { /* new governor failed, so re-start old one */ pr_debug("starting governor %s failed\n", data->governor->name); if (old_gov) { data->governor = old_gov; + __cpufreq_governor(data, + CPUFREQ_GOV_POLICY_INIT); __cpufreq_governor(data, CPUFREQ_GOV_START); } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index a22944ca0526..b7393b56f552 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -106,6 +106,7 @@ struct cpufreq_policy { * governors are used */ unsigned int policy; /* see above */ struct cpufreq_governor *governor; /* see below */ + void *governor_data; struct work_struct update; /* if update_policy() needs to be * called, but you're in IRQ context */ @@ -178,9 +179,11 @@ static inline unsigned long cpufreq_scale(unsigned long old, u_int div, u_int mu * CPUFREQ GOVERNORS * *********************************************************************/ -#define CPUFREQ_GOV_START 1 -#define CPUFREQ_GOV_STOP 2 -#define CPUFREQ_GOV_LIMITS 3 +#define CPUFREQ_GOV_START 1 +#define CPUFREQ_GOV_STOP 2 +#define CPUFREQ_GOV_LIMITS 3 +#define CPUFREQ_GOV_POLICY_INIT 4 +#define CPUFREQ_GOV_POLICY_EXIT 5 struct cpufreq_governor { char name[CPUFREQ_NAME_LEN]; -- cgit From 4d5dcc4211f9def4281eafb54b8ed483862e8135 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 27 Mar 2013 15:58:58 +0000 Subject: cpufreq: governor: Implement per policy instances of governors Currently, there can't be multiple instances of a single governor_type. If we have a multi-package system with multiple instances of struct policy (one per package), we can't have multiple instances of the same governor; i.e., we can't have multiple instances of the ondemand governor for multiple packages. The governor's sysfs directory is created at /sys/devices/system/cpu/cpufreq/governor-name/, which again reflects that there can be only one instance of a governor_type in the system. This is a bottleneck for multicluster systems, where we want different packages to use the same governor type but with different tunables. This patch uses the infrastructure provided by the earlier patch and implements init/exit routines for the ondemand and conservative governors. Signed-off-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki
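The dual sysfs layout implemented below boils down to one lookup rule, sketched here as a condensed paraphrase of the helpers this patch adds to cpufreq_governor.c (not an additional API): when the driver sets have_governor_per_policy, the tunables hang off policy->governor_data and the attribute group is created under the policy's kobject; otherwise a single shared dbs_data is reached through cdata->gdbs_data under the global cpufreq kobject.

/* Condensed illustration of the per-policy vs. per-system lookup. */
static struct dbs_data *foo_get_dbs_data(struct cpufreq_policy *policy,
					 struct common_dbs_data *cdata)
{
	if (have_governor_per_policy())
		return policy->governor_data;	/* one instance per policy */

	return cdata->gdbs_data;		/* one shared instance */
}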
--- drivers/cpufreq/cpufreq.c | 15 +- drivers/cpufreq/cpufreq_conservative.c | 193 ++++++++++++++---------- drivers/cpufreq/cpufreq_governor.c | 212 +++++++++++++++++--------- drivers/cpufreq/cpufreq_governor.h | 117 +++++++++++++-- drivers/cpufreq/cpufreq_ondemand.c | 263 ++++++++++++++++++++------------- include/linux/cpufreq.h | 8 + 6 files changed, 538 insertions(+), 270 deletions(-) (limited to 'include/linux') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 08df7a196116..85963fc48a5f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -128,6 +128,11 @@ void disable_cpufreq(void) static LIST_HEAD(cpufreq_governor_list); static DEFINE_MUTEX(cpufreq_governor_mutex); +bool have_governor_per_policy(void) +{ + return cpufreq_driver->have_governor_per_policy; +} + static struct cpufreq_policy *__cpufreq_cpu_get(unsigned int cpu, bool sysfs) { struct cpufreq_policy *data; @@ -1546,10 +1551,12 @@ static int __cpufreq_governor(struct cpufreq_policy *policy, policy->cpu, event); ret = policy->governor->governor(policy, event); - if (event == CPUFREQ_GOV_START) - policy->governor->initialized++; - else if (event == CPUFREQ_GOV_STOP) - policy->governor->initialized--; + if (!ret) { + if (event == CPUFREQ_GOV_POLICY_INIT) + policy->governor->initialized++; + else if (event == CPUFREQ_GOV_POLICY_EXIT) + policy->governor->initialized--; + } /* we keep one module reference alive for each CPU governed by this CPU */ diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 4fd0006b1291..98b49462f4e9 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -31,17 +32,8 @@ #define DEF_SAMPLING_DOWN_FACTOR (1) #define MAX_SAMPLING_DOWN_FACTOR (10) -static struct dbs_data cs_dbs_data; static DEFINE_PER_CPU(struct cs_cpu_dbs_info_s, cs_cpu_dbs_info); -static struct cs_dbs_tuners cs_tuners = { - .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, - .down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD, - .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, - .ignore_nice = 0, - .freq_step = 5, -}; - /* * Every sampling_rate, we check, if current idle time is less than 20% * (default), then we try to increase frequency Every sampling_rate * @@ -55,24 +47,26 @@ static void cs_check_cpu(int cpu, unsigned int load) { struct cs_cpu_dbs_info_s *dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; + struct dbs_data *dbs_data = policy->governor_data; + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int freq_target; /* * break out if we 'cannot' reduce the speed as the user might * want freq_step to be zero */ - if (cs_tuners.freq_step == 0) + if (cs_tuners->freq_step == 0) return; /* Check for frequency increase */ - if (load > cs_tuners.up_threshold) { + if (load > cs_tuners->up_threshold) { dbs_info->down_skip = 0; /* if we are already at full speed then break out early */ if (dbs_info->requested_freq == policy->max) return; - freq_target = (cs_tuners.freq_step * policy->max) / 100; + freq_target = (cs_tuners->freq_step * policy->max) / 100;
/* max freq cannot be less than 100. But who knows.... */ if (unlikely(freq_target == 0)) @@ -92,8 +86,8 @@ static void cs_check_cpu(int cpu, unsigned int load) * support the current CPU usage without triggering the up policy. To be * safe, we focus 10 points under the threshold. */ - if (load < (cs_tuners.down_threshold - 10)) { - freq_target = (cs_tuners.freq_step * policy->max) / 100; + if (load < (cs_tuners->down_threshold - 10)) { + freq_target = (cs_tuners->freq_step * policy->max) / 100; dbs_info->requested_freq -= freq_target; if (dbs_info->requested_freq < policy->min) @@ -119,11 +113,13 @@ static void cs_dbs_timer(struct work_struct *work) unsigned int cpu = dbs_info->cdbs.cur_policy->cpu; struct cs_cpu_dbs_info_s *core_dbs_info = &per_cpu(cs_cpu_dbs_info, cpu); - int delay = delay_for_sampling_rate(cs_tuners.sampling_rate); + struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data; + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; + int delay = delay_for_sampling_rate(cs_tuners->sampling_rate); mutex_lock(&core_dbs_info->cdbs.timer_mutex); - if (need_load_eval(&core_dbs_info->cdbs, cs_tuners.sampling_rate)) - dbs_check_cpu(&cs_dbs_data, cpu); + if (need_load_eval(&core_dbs_info->cdbs, cs_tuners->sampling_rate)) + dbs_check_cpu(dbs_data, cpu); schedule_delayed_work_on(smp_processor_id(), dw, delay); mutex_unlock(&core_dbs_info->cdbs.timer_mutex); @@ -154,16 +150,12 @@ static int dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, } /************************** sysfs interface ************************/ -static ssize_t show_sampling_rate_min(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - return sprintf(buf, "%u\n", cs_dbs_data.min_sampling_rate); -} +static struct common_dbs_data cs_dbs_cdata; -static ssize_t store_sampling_down_factor(struct kobject *a, - struct attribute *b, - const char *buf, size_t count) +static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, + const char *buf, size_t count) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -171,13 +163,14 @@ static ssize_t store_sampling_down_factor(struct kobject *a, if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; - cs_tuners.sampling_down_factor = input; + cs_tuners->sampling_down_factor = input; return count; } -static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -185,43 +178,46 @@ static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, if (ret != 1) return -EINVAL; - cs_tuners.sampling_rate = max(input, cs_dbs_data.min_sampling_rate); + cs_tuners->sampling_rate = max(input, dbs_data->min_sampling_rate); return count; } -static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); - if (ret != 1 || input > 100 || input <= cs_tuners.down_threshold) + if (ret != 1 || input > 100 || input <= cs_tuners->down_threshold) return -EINVAL; - cs_tuners.up_threshold = input; + cs_tuners->up_threshold = input; return count; } -static 
ssize_t store_down_threshold(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_down_threshold(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); /* cannot be lower than 11 otherwise freq will not fall */ if (ret != 1 || input < 11 || input > 100 || - input >= cs_tuners.up_threshold) + input >= cs_tuners->up_threshold) return -EINVAL; - cs_tuners.down_threshold = input; + cs_tuners->down_threshold = input; return count; } -static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_ignore_nice(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input, j; int ret; @@ -232,10 +228,10 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, if (input > 1) input = 1; - if (input == cs_tuners.ignore_nice) /* nothing to do */ + if (input == cs_tuners->ignore_nice) /* nothing to do */ return count; - cs_tuners.ignore_nice = input; + cs_tuners->ignore_nice = input; /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { @@ -243,16 +239,17 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, dbs_info = &per_cpu(cs_cpu_dbs_info, j); dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->cdbs.prev_cpu_wall); - if (cs_tuners.ignore_nice) + if (cs_tuners->ignore_nice) dbs_info->cdbs.prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; } return count; } -static ssize_t store_freq_step(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_freq_step(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -267,43 +264,88 @@ static ssize_t store_freq_step(struct kobject *a, struct attribute *b, * no need to test here if freq_step is zero as the user might actually * want this, they would be crazy though :) */ - cs_tuners.freq_step = input; + cs_tuners->freq_step = input; return count; } -show_one(cs, sampling_rate, sampling_rate); -show_one(cs, sampling_down_factor, sampling_down_factor); -show_one(cs, up_threshold, up_threshold); -show_one(cs, down_threshold, down_threshold); -show_one(cs, ignore_nice_load, ignore_nice); -show_one(cs, freq_step, freq_step); - -define_one_global_rw(sampling_rate); -define_one_global_rw(sampling_down_factor); -define_one_global_rw(up_threshold); -define_one_global_rw(down_threshold); -define_one_global_rw(ignore_nice_load); -define_one_global_rw(freq_step); -define_one_global_ro(sampling_rate_min); - -static struct attribute *dbs_attributes[] = { - &sampling_rate_min.attr, - &sampling_rate.attr, - &sampling_down_factor.attr, - &up_threshold.attr, - &down_threshold.attr, - &ignore_nice_load.attr, - &freq_step.attr, +show_store_one(cs, sampling_rate); +show_store_one(cs, sampling_down_factor); +show_store_one(cs, up_threshold); +show_store_one(cs, down_threshold); +show_store_one(cs, ignore_nice); +show_store_one(cs, freq_step); +declare_show_sampling_rate_min(cs); + +gov_sys_pol_attr_rw(sampling_rate); +gov_sys_pol_attr_rw(sampling_down_factor); +gov_sys_pol_attr_rw(up_threshold); +gov_sys_pol_attr_rw(down_threshold); +gov_sys_pol_attr_rw(ignore_nice); +gov_sys_pol_attr_rw(freq_step); +gov_sys_pol_attr_ro(sampling_rate_min); 
+ +static struct attribute *dbs_attributes_gov_sys[] = { + &sampling_rate_min_gov_sys.attr, + &sampling_rate_gov_sys.attr, + &sampling_down_factor_gov_sys.attr, + &up_threshold_gov_sys.attr, + &down_threshold_gov_sys.attr, + &ignore_nice_gov_sys.attr, + &freq_step_gov_sys.attr, NULL }; -static struct attribute_group cs_attr_group = { - .attrs = dbs_attributes, +static struct attribute_group cs_attr_group_gov_sys = { + .attrs = dbs_attributes_gov_sys, + .name = "conservative", +}; + +static struct attribute *dbs_attributes_gov_pol[] = { + &sampling_rate_min_gov_pol.attr, + &sampling_rate_gov_pol.attr, + &sampling_down_factor_gov_pol.attr, + &up_threshold_gov_pol.attr, + &down_threshold_gov_pol.attr, + &ignore_nice_gov_pol.attr, + &freq_step_gov_pol.attr, + NULL +}; + +static struct attribute_group cs_attr_group_gov_pol = { + .attrs = dbs_attributes_gov_pol, .name = "conservative", }; /************************** sysfs end ************************/ +static int cs_init(struct dbs_data *dbs_data) +{ + struct cs_dbs_tuners *tuners; + + tuners = kzalloc(sizeof(struct cs_dbs_tuners), GFP_KERNEL); + if (!tuners) { + pr_err("%s: kzalloc failed\n", __func__); + return -ENOMEM; + } + + tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; + tuners->down_threshold = DEF_FREQUENCY_DOWN_THRESHOLD; + tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; + tuners->ignore_nice = 0; + tuners->freq_step = 5; + + dbs_data->tuners = tuners; + dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * + jiffies_to_usecs(10); + mutex_init(&dbs_data->mutex); + return 0; +} + +static void cs_exit(struct dbs_data *dbs_data) +{ + kfree(dbs_data->tuners); +} + define_get_cpu_dbs_routines(cs_cpu_dbs_info); static struct notifier_block cs_cpufreq_notifier_block = { @@ -314,21 +356,23 @@ static struct cs_ops cs_ops = { .notifier_block = &cs_cpufreq_notifier_block, }; -static struct dbs_data cs_dbs_data = { +static struct common_dbs_data cs_dbs_cdata = { .governor = GOV_CONSERVATIVE, - .attr_group = &cs_attr_group, - .tuners = &cs_tuners, + .attr_group_gov_sys = &cs_attr_group_gov_sys, + .attr_group_gov_pol = &cs_attr_group_gov_pol, .get_cpu_cdbs = get_cpu_cdbs, .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = cs_dbs_timer, .gov_check_cpu = cs_check_cpu, .gov_ops = &cs_ops, + .init = cs_init, + .exit = cs_exit, }; static int cs_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { - return cpufreq_governor_dbs(&cs_dbs_data, policy, event); + return cpufreq_governor_dbs(policy, &cs_dbs_cdata, event); } #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE @@ -343,7 +387,6 @@ struct cpufreq_governor cpufreq_gov_conservative = { static int __init cpufreq_gov_dbs_init(void) { - mutex_init(&cs_dbs_data.mutex); return cpufreq_register_governor(&cpufreq_gov_conservative); } diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 5a76086ff09b..26fbb729bc1c 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -22,12 +22,29 @@ #include #include #include +#include #include #include #include #include "cpufreq_governor.h" +static struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy) +{ + if (have_governor_per_policy()) + return &policy->kobj; + else + return cpufreq_global_kobject; +} + +static struct attribute_group *get_sysfs_attr(struct dbs_data *dbs_data) +{ + if (have_governor_per_policy()) + return dbs_data->cdata->attr_group_gov_pol; + else + return dbs_data->cdata->attr_group_gov_sys; +} + static inline u64 
get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall) { u64 idle_time; @@ -65,7 +82,7 @@ EXPORT_SYMBOL_GPL(get_cpu_idle_time); void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) { - struct cpu_dbs_common_info *cdbs = dbs_data->get_cpu_cdbs(cpu); + struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); struct od_dbs_tuners *od_tuners = dbs_data->tuners; struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; struct cpufreq_policy *policy; @@ -73,7 +90,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) unsigned int ignore_nice; unsigned int j; - if (dbs_data->governor == GOV_ONDEMAND) + if (dbs_data->cdata->governor == GOV_ONDEMAND) ignore_nice = od_tuners->ignore_nice; else ignore_nice = cs_tuners->ignore_nice; @@ -87,7 +104,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) unsigned int idle_time, wall_time, iowait_time; unsigned int load; - j_cdbs = dbs_data->get_cpu_cdbs(j); + j_cdbs = dbs_data->cdata->get_cpu_cdbs(j); cur_idle_time = get_cpu_idle_time(j, &cur_wall_time); @@ -117,9 +134,9 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) idle_time += jiffies_to_usecs(cur_nice_jiffies); } - if (dbs_data->governor == GOV_ONDEMAND) { + if (dbs_data->cdata->governor == GOV_ONDEMAND) { struct od_cpu_dbs_info_s *od_j_dbs_info = - dbs_data->get_cpu_dbs_info_s(cpu); + dbs_data->cdata->get_cpu_dbs_info_s(cpu); cur_iowait_time = get_cpu_iowait_time_us(j, &cur_wall_time); @@ -145,7 +162,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) load = 100 * (wall_time - idle_time) / wall_time; - if (dbs_data->governor == GOV_ONDEMAND) { + if (dbs_data->cdata->governor == GOV_ONDEMAND) { int freq_avg = __cpufreq_driver_getavg(policy, j); if (freq_avg <= 0) freq_avg = policy->cur; @@ -157,7 +174,7 @@ void dbs_check_cpu(struct dbs_data *dbs_data, int cpu) max_load = load; } - dbs_data->gov_check_cpu(cpu, max_load); + dbs_data->cdata->gov_check_cpu(cpu, max_load); } EXPORT_SYMBOL_GPL(dbs_check_cpu); @@ -165,14 +182,14 @@ static inline void dbs_timer_init(struct dbs_data *dbs_data, int cpu, unsigned int sampling_rate) { int delay = delay_for_sampling_rate(sampling_rate); - struct cpu_dbs_common_info *cdbs = dbs_data->get_cpu_cdbs(cpu); + struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); schedule_delayed_work_on(cpu, &cdbs->work, delay); } static inline void dbs_timer_exit(struct dbs_data *dbs_data, int cpu) { - struct cpu_dbs_common_info *cdbs = dbs_data->get_cpu_cdbs(cpu); + struct cpu_dbs_common_info *cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); cancel_delayed_work_sync(&cdbs->work); } @@ -196,31 +213,128 @@ bool need_load_eval(struct cpu_dbs_common_info *cdbs, } EXPORT_SYMBOL_GPL(need_load_eval); -int cpufreq_governor_dbs(struct dbs_data *dbs_data, - struct cpufreq_policy *policy, unsigned int event) +static void set_sampling_rate(struct dbs_data *dbs_data, + unsigned int sampling_rate) +{ + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + struct cs_dbs_tuners *cs_tuners = dbs_data->tuners; + cs_tuners->sampling_rate = sampling_rate; + } else { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; + od_tuners->sampling_rate = sampling_rate; + } +} + +int cpufreq_governor_dbs(struct cpufreq_policy *policy, + struct common_dbs_data *cdata, unsigned int event) { + struct dbs_data *dbs_data; struct od_cpu_dbs_info_s *od_dbs_info = NULL; struct cs_cpu_dbs_info_s *cs_dbs_info = NULL; - struct cs_ops *cs_ops = NULL; struct od_ops *od_ops = NULL; - struct od_dbs_tuners *od_tuners = dbs_data->tuners; - struct cs_dbs_tuners *cs_tuners = 
dbs_data->tuners; + struct od_dbs_tuners *od_tuners = NULL; + struct cs_dbs_tuners *cs_tuners = NULL; struct cpu_dbs_common_info *cpu_cdbs; - unsigned int *sampling_rate, latency, ignore_nice, j, cpu = policy->cpu; + unsigned int sampling_rate, latency, ignore_nice, j, cpu = policy->cpu; int rc; - cpu_cdbs = dbs_data->get_cpu_cdbs(cpu); + if (have_governor_per_policy()) + dbs_data = policy->governor_data; + else + dbs_data = cdata->gdbs_data; + + WARN_ON(!dbs_data && (event != CPUFREQ_GOV_POLICY_INIT)); + + switch (event) { + case CPUFREQ_GOV_POLICY_INIT: + if (have_governor_per_policy()) { + WARN_ON(dbs_data); + } else if (dbs_data) { + policy->governor_data = dbs_data; + return 0; + } + + dbs_data = kzalloc(sizeof(*dbs_data), GFP_KERNEL); + if (!dbs_data) { + pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__); + return -ENOMEM; + } + + dbs_data->cdata = cdata; + rc = cdata->init(dbs_data); + if (rc) { + pr_err("%s: POLICY_INIT: init() failed\n", __func__); + kfree(dbs_data); + return rc; + } + + rc = sysfs_create_group(get_governor_parent_kobj(policy), + get_sysfs_attr(dbs_data)); + if (rc) { + cdata->exit(dbs_data); + kfree(dbs_data); + return rc; + } + + policy->governor_data = dbs_data; + + /* policy latency is in nS. Convert it to uS first */ + latency = policy->cpuinfo.transition_latency / 1000; + if (latency == 0) + latency = 1; + + /* Bring kernel and HW constraints together */ + dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, + MIN_LATENCY_MULTIPLIER * latency); + set_sampling_rate(dbs_data, max(dbs_data->min_sampling_rate, + latency * LATENCY_MULTIPLIER)); + + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + struct cs_ops *cs_ops = dbs_data->cdata->gov_ops; + + cpufreq_register_notifier(cs_ops->notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + + if (!have_governor_per_policy()) + cdata->gdbs_data = dbs_data; + + return 0; + case CPUFREQ_GOV_POLICY_EXIT: + if ((policy->governor->initialized == 1) || + have_governor_per_policy()) { + sysfs_remove_group(get_governor_parent_kobj(policy), + get_sysfs_attr(dbs_data)); + + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + struct cs_ops *cs_ops = dbs_data->cdata->gov_ops; + + cpufreq_unregister_notifier(cs_ops->notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); + } + + cdata->exit(dbs_data); + kfree(dbs_data); + cdata->gdbs_data = NULL; + } - if (dbs_data->governor == GOV_CONSERVATIVE) { - cs_dbs_info = dbs_data->get_cpu_dbs_info_s(cpu); - sampling_rate = &cs_tuners->sampling_rate; + policy->governor_data = NULL; + return 0; + } + + cpu_cdbs = dbs_data->cdata->get_cpu_cdbs(cpu); + + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { + cs_tuners = dbs_data->tuners; + cs_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu); + sampling_rate = cs_tuners->sampling_rate; ignore_nice = cs_tuners->ignore_nice; - cs_ops = dbs_data->gov_ops; } else { - od_dbs_info = dbs_data->get_cpu_dbs_info_s(cpu); - sampling_rate = &od_tuners->sampling_rate; + od_tuners = dbs_data->tuners; + od_dbs_info = dbs_data->cdata->get_cpu_dbs_info_s(cpu); + sampling_rate = od_tuners->sampling_rate; ignore_nice = od_tuners->ignore_nice; - od_ops = dbs_data->gov_ops; + od_ops = dbs_data->cdata->gov_ops; } switch (event) { @@ -232,7 +346,7 @@ int cpufreq_governor_dbs(struct dbs_data *dbs_data, for_each_cpu(j, policy->cpus) { struct cpu_dbs_common_info *j_cdbs = - dbs_data->get_cpu_cdbs(j); + dbs_data->cdata->get_cpu_cdbs(j); j_cdbs->cpu = j; j_cdbs->cur_policy = policy; @@ -244,69 +358,34 @@ int cpufreq_governor_dbs(struct dbs_data 
*dbs_data, mutex_init(&j_cdbs->timer_mutex); INIT_DEFERRABLE_WORK(&j_cdbs->work, - dbs_data->gov_dbs_timer); - } - - if (!policy->governor->initialized) { - rc = sysfs_create_group(cpufreq_global_kobject, - dbs_data->attr_group); - if (rc) { - mutex_unlock(&dbs_data->mutex); - return rc; - } + dbs_data->cdata->gov_dbs_timer); } /* * conservative does not implement micro like ondemand * governor, thus we are bound to jiffes/HZ */ - if (dbs_data->governor == GOV_CONSERVATIVE) { + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) { cs_dbs_info->down_skip = 0; cs_dbs_info->enable = 1; cs_dbs_info->requested_freq = policy->cur; - - if (!policy->governor->initialized) { - cpufreq_register_notifier(cs_ops->notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - - dbs_data->min_sampling_rate = - MIN_SAMPLING_RATE_RATIO * - jiffies_to_usecs(10); - } } else { od_dbs_info->rate_mult = 1; od_dbs_info->sample_type = OD_NORMAL_SAMPLE; od_ops->powersave_bias_init_cpu(cpu); - - if (!policy->governor->initialized) - od_tuners->io_is_busy = od_ops->io_busy(); } - if (policy->governor->initialized) - goto unlock; - - /* policy latency is in nS. Convert it to uS first */ - latency = policy->cpuinfo.transition_latency / 1000; - if (latency == 0) - latency = 1; - - /* Bring kernel and HW constraints together */ - dbs_data->min_sampling_rate = max(dbs_data->min_sampling_rate, - MIN_LATENCY_MULTIPLIER * latency); - *sampling_rate = max(dbs_data->min_sampling_rate, latency * - LATENCY_MULTIPLIER); -unlock: mutex_unlock(&dbs_data->mutex); /* Initiate timer time stamp */ cpu_cdbs->time_stamp = ktime_get(); for_each_cpu(j, policy->cpus) - dbs_timer_init(dbs_data, j, *sampling_rate); + dbs_timer_init(dbs_data, j, sampling_rate); break; case CPUFREQ_GOV_STOP: - if (dbs_data->governor == GOV_CONSERVATIVE) + if (dbs_data->cdata->governor == GOV_CONSERVATIVE) cs_dbs_info->enable = 0; for_each_cpu(j, policy->cpus) @@ -315,13 +394,6 @@ unlock: mutex_lock(&dbs_data->mutex); mutex_destroy(&cpu_cdbs->timer_mutex); - if (policy->governor->initialized == 1) { - sysfs_remove_group(cpufreq_global_kobject, - dbs_data->attr_group); - if (dbs_data->governor == GOV_CONSERVATIVE) - cpufreq_unregister_notifier(cs_ops->notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); - } mutex_unlock(&dbs_data->mutex); break; diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index 46bde01eee62..c83cabf14b2f 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -40,14 +40,75 @@ /* Ondemand Sampling types */ enum {OD_NORMAL_SAMPLE, OD_SUB_SAMPLE}; -/* Macro creating sysfs show routines */ -#define show_one(_gov, file_name, object) \ -static ssize_t show_##file_name \ +/* + * Macro for creating governors sysfs routines + * + * - gov_sys: One governor instance per whole system + * - gov_pol: One governor instance per policy + */ + +/* Create attributes */ +#define gov_sys_attr_ro(_name) \ +static struct global_attr _name##_gov_sys = \ +__ATTR(_name, 0444, show_##_name##_gov_sys, NULL) + +#define gov_sys_attr_rw(_name) \ +static struct global_attr _name##_gov_sys = \ +__ATTR(_name, 0644, show_##_name##_gov_sys, store_##_name##_gov_sys) + +#define gov_pol_attr_ro(_name) \ +static struct freq_attr _name##_gov_pol = \ +__ATTR(_name, 0444, show_##_name##_gov_pol, NULL) + +#define gov_pol_attr_rw(_name) \ +static struct freq_attr _name##_gov_pol = \ +__ATTR(_name, 0644, show_##_name##_gov_pol, store_##_name##_gov_pol) + +#define gov_sys_pol_attr_rw(_name) \ + gov_sys_attr_rw(_name); \ + 
gov_pol_attr_rw(_name) + +#define gov_sys_pol_attr_ro(_name) \ + gov_sys_attr_ro(_name); \ + gov_pol_attr_ro(_name) + +/* Create show/store routines */ +#define show_one(_gov, file_name) \ +static ssize_t show_##file_name##_gov_sys \ (struct kobject *kobj, struct attribute *attr, char *buf) \ { \ - return sprintf(buf, "%u\n", _gov##_tuners.object); \ + struct _gov##_dbs_tuners *tuners = _gov##_dbs_cdata.gdbs_data->tuners; \ + return sprintf(buf, "%u\n", tuners->file_name); \ +} \ + \ +static ssize_t show_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + struct dbs_data *dbs_data = policy->governor_data; \ + struct _gov##_dbs_tuners *tuners = dbs_data->tuners; \ + return sprintf(buf, "%u\n", tuners->file_name); \ +} + +#define store_one(_gov, file_name) \ +static ssize_t store_##file_name##_gov_sys \ +(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) \ +{ \ + struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \ + return store_##file_name(dbs_data, buf, count); \ +} \ + \ +static ssize_t store_##file_name##_gov_pol \ +(struct cpufreq_policy *policy, const char *buf, size_t count) \ +{ \ + struct dbs_data *dbs_data = policy->governor_data; \ + return store_##file_name(dbs_data, buf, count); \ } +#define show_store_one(_gov, file_name) \ +show_one(_gov, file_name); \ +store_one(_gov, file_name) + +/* create helper routines */ #define define_get_cpu_dbs_routines(_dbs_info) \ static struct cpu_dbs_common_info *get_cpu_cdbs(int cpu) \ { \ @@ -103,7 +164,7 @@ struct cs_cpu_dbs_info_s { unsigned int enable:1; }; -/* Governers sysfs tunables */ +/* Per policy Governers sysfs tunables */ struct od_dbs_tuners { unsigned int ignore_nice; unsigned int sampling_rate; @@ -123,31 +184,42 @@ struct cs_dbs_tuners { unsigned int freq_step; }; -/* Per Governer data */ -struct dbs_data { +/* Common Governer data across policies */ +struct dbs_data; +struct common_dbs_data { /* Common across governors */ #define GOV_ONDEMAND 0 #define GOV_CONSERVATIVE 1 int governor; - unsigned int min_sampling_rate; - struct attribute_group *attr_group; - void *tuners; + struct attribute_group *attr_group_gov_sys; /* one governor - system */ + struct attribute_group *attr_group_gov_pol; /* one governor - policy */ - /* dbs_mutex protects dbs_enable in governor start/stop */ - struct mutex mutex; + /* Common data for platforms that don't set have_governor_per_policy */ + struct dbs_data *gdbs_data; struct cpu_dbs_common_info *(*get_cpu_cdbs)(int cpu); void *(*get_cpu_dbs_info_s)(int cpu); void (*gov_dbs_timer)(struct work_struct *work); void (*gov_check_cpu)(int cpu, unsigned int load); + int (*init)(struct dbs_data *dbs_data); + void (*exit)(struct dbs_data *dbs_data); /* Governor specific ops, see below */ void *gov_ops; }; +/* Governer Per policy data */ +struct dbs_data { + struct common_dbs_data *cdata; + unsigned int min_sampling_rate; + void *tuners; + + /* dbs_mutex protects dbs_enable in governor start/stop */ + struct mutex mutex; +}; + /* Governor specific ops, will be passed to dbs_data->gov_ops */ struct od_ops { - int (*io_busy)(void); void (*powersave_bias_init_cpu)(int cpu); unsigned int (*powersave_bias_target)(struct cpufreq_policy *policy, unsigned int freq_next, unsigned int relation); @@ -169,10 +241,25 @@ static inline int delay_for_sampling_rate(unsigned int sampling_rate) return delay; } +#define declare_show_sampling_rate_min(_gov) \ +static ssize_t show_sampling_rate_min_gov_sys \ +(struct kobject *kobj, struct attribute *attr, char *buf) \ +{ 
\ + struct dbs_data *dbs_data = _gov##_dbs_cdata.gdbs_data; \ + return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ +} \ + \ +static ssize_t show_sampling_rate_min_gov_pol \ +(struct cpufreq_policy *policy, char *buf) \ +{ \ + struct dbs_data *dbs_data = policy->governor_data; \ + return sprintf(buf, "%u\n", dbs_data->min_sampling_rate); \ +} + u64 get_cpu_idle_time(unsigned int cpu, u64 *wall); void dbs_check_cpu(struct dbs_data *dbs_data, int cpu); bool need_load_eval(struct cpu_dbs_common_info *cdbs, unsigned int sampling_rate); -int cpufreq_governor_dbs(struct dbs_data *dbs_data, - struct cpufreq_policy *policy, unsigned int event); +int cpufreq_governor_dbs(struct cpufreq_policy *policy, + struct common_dbs_data *cdata, unsigned int event); #endif /* _CPUFREQ_GOVERNER_H */ diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c index f3eb26cd848f..15e80ee61352 100644 --- a/drivers/cpufreq/cpufreq_ondemand.c +++ b/drivers/cpufreq/cpufreq_ondemand.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -37,22 +38,12 @@ #define MIN_FREQUENCY_UP_THRESHOLD (11) #define MAX_FREQUENCY_UP_THRESHOLD (100) -static struct dbs_data od_dbs_data; static DEFINE_PER_CPU(struct od_cpu_dbs_info_s, od_cpu_dbs_info); #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND static struct cpufreq_governor cpufreq_gov_ondemand; #endif -static struct od_dbs_tuners od_tuners = { - .up_threshold = DEF_FREQUENCY_UP_THRESHOLD, - .sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR, - .adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD - - DEF_FREQUENCY_DOWN_DIFFERENTIAL, - .ignore_nice = 0, - .powersave_bias = 0, -}; - static void ondemand_powersave_bias_init_cpu(int cpu) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); @@ -98,6 +89,8 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, unsigned int jiffies_total, jiffies_hi, jiffies_lo; struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, policy->cpu); + struct dbs_data *dbs_data = policy->governor_data; + struct od_dbs_tuners *od_tuners = dbs_data->tuners; if (!dbs_info->freq_table) { dbs_info->freq_lo = 0; @@ -108,7 +101,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next, relation, &index); freq_req = dbs_info->freq_table[index].frequency; - freq_reduc = freq_req * od_tuners.powersave_bias / 1000; + freq_reduc = freq_req * od_tuners->powersave_bias / 1000; freq_avg = freq_req - freq_reduc; /* Find freq bounds for freq_avg in freq_table */ @@ -127,7 +120,7 @@ static unsigned int powersave_bias_target(struct cpufreq_policy *policy, dbs_info->freq_lo_jiffies = 0; return freq_lo; } - jiffies_total = usecs_to_jiffies(od_tuners.sampling_rate); + jiffies_total = usecs_to_jiffies(od_tuners->sampling_rate); jiffies_hi = (freq_avg - freq_lo) * jiffies_total; jiffies_hi += ((freq_hi - freq_lo) / 2); jiffies_hi /= (freq_hi - freq_lo); @@ -148,12 +141,15 @@ static void ondemand_powersave_bias_init(void) static void dbs_freq_increase(struct cpufreq_policy *p, unsigned int freq) { - if (od_tuners.powersave_bias) + struct dbs_data *dbs_data = p->governor_data; + struct od_dbs_tuners *od_tuners = dbs_data->tuners; + + if (od_tuners->powersave_bias) freq = powersave_bias_target(p, freq, CPUFREQ_RELATION_H); else if (p->cur == p->max) return; - __cpufreq_driver_target(p, freq, od_tuners.powersave_bias ? + __cpufreq_driver_target(p, freq, od_tuners->powersave_bias ? 
CPUFREQ_RELATION_L : CPUFREQ_RELATION_H); } @@ -170,15 +166,17 @@ static void od_check_cpu(int cpu, unsigned int load_freq) { struct od_cpu_dbs_info_s *dbs_info = &per_cpu(od_cpu_dbs_info, cpu); struct cpufreq_policy *policy = dbs_info->cdbs.cur_policy; + struct dbs_data *dbs_data = policy->governor_data; + struct od_dbs_tuners *od_tuners = dbs_data->tuners; dbs_info->freq_lo = 0; /* Check for frequency increase */ - if (load_freq > od_tuners.up_threshold * policy->cur) { + if (load_freq > od_tuners->up_threshold * policy->cur) { /* If switching to max speed, apply sampling_down_factor */ if (policy->cur < policy->max) dbs_info->rate_mult = - od_tuners.sampling_down_factor; + od_tuners->sampling_down_factor; dbs_freq_increase(policy, policy->max); return; } @@ -193,9 +191,10 @@ static void od_check_cpu(int cpu, unsigned int load_freq) * support the current CPU usage without triggering the up policy. To be * safe, we focus 10 points under the threshold. */ - if (load_freq < od_tuners.adj_up_threshold * policy->cur) { + if (load_freq < od_tuners->adj_up_threshold + * policy->cur) { unsigned int freq_next; - freq_next = load_freq / od_tuners.adj_up_threshold; + freq_next = load_freq / od_tuners->adj_up_threshold; /* No longer fully busy, reset rate_mult */ dbs_info->rate_mult = 1; @@ -203,7 +202,7 @@ static void od_check_cpu(int cpu, unsigned int load_freq) if (freq_next < policy->min) freq_next = policy->min; - if (!od_tuners.powersave_bias) { + if (!od_tuners->powersave_bias) { __cpufreq_driver_target(policy, freq_next, CPUFREQ_RELATION_L); } else { @@ -223,12 +222,14 @@ static void od_dbs_timer(struct work_struct *work) unsigned int cpu = dbs_info->cdbs.cur_policy->cpu; struct od_cpu_dbs_info_s *core_dbs_info = &per_cpu(od_cpu_dbs_info, cpu); + struct dbs_data *dbs_data = dbs_info->cdbs.cur_policy->governor_data; + struct od_dbs_tuners *od_tuners = dbs_data->tuners; int delay, sample_type = core_dbs_info->sample_type; bool eval_load; mutex_lock(&core_dbs_info->cdbs.timer_mutex); eval_load = need_load_eval(&core_dbs_info->cdbs, - od_tuners.sampling_rate); + od_tuners->sampling_rate); /* Common NORMAL_SAMPLE setup */ core_dbs_info->sample_type = OD_NORMAL_SAMPLE; @@ -240,13 +241,13 @@ static void od_dbs_timer(struct work_struct *work) CPUFREQ_RELATION_H); } else { if (eval_load) - dbs_check_cpu(&od_dbs_data, cpu); + dbs_check_cpu(dbs_data, cpu); if (core_dbs_info->freq_lo) { /* Setup timer for SUB_SAMPLE */ core_dbs_info->sample_type = OD_SUB_SAMPLE; delay = core_dbs_info->freq_hi_jiffies; } else { - delay = delay_for_sampling_rate(od_tuners.sampling_rate + delay = delay_for_sampling_rate(od_tuners->sampling_rate * core_dbs_info->rate_mult); } } @@ -256,12 +257,7 @@ static void od_dbs_timer(struct work_struct *work) } /************************** sysfs interface ************************/ - -static ssize_t show_sampling_rate_min(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - return sprintf(buf, "%u\n", od_dbs_data.min_sampling_rate); -} +static struct common_dbs_data od_dbs_cdata; /** * update_sampling_rate - update sampling rate effective immediately if needed. @@ -276,12 +272,14 @@ static ssize_t show_sampling_rate_min(struct kobject *kobj, * reducing the sampling rate, we need to make the new value effective * immediately. 
*/ -static void update_sampling_rate(unsigned int new_rate) +static void update_sampling_rate(struct dbs_data *dbs_data, + unsigned int new_rate) { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; int cpu; - od_tuners.sampling_rate = new_rate = max(new_rate, - od_dbs_data.min_sampling_rate); + od_tuners->sampling_rate = new_rate = max(new_rate, + dbs_data->min_sampling_rate); for_each_online_cpu(cpu) { struct cpufreq_policy *policy; @@ -322,34 +320,37 @@ static void update_sampling_rate(unsigned int new_rate) } } -static ssize_t store_sampling_rate(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_sampling_rate(struct dbs_data *dbs_data, const char *buf, + size_t count) { unsigned int input; int ret; ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; - update_sampling_rate(input); + + update_sampling_rate(dbs_data, input); return count; } -static ssize_t store_io_is_busy(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_io_is_busy(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); if (ret != 1) return -EINVAL; - od_tuners.io_is_busy = !!input; + od_tuners->io_is_busy = !!input; return count; } -static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_up_threshold(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -359,23 +360,24 @@ static ssize_t store_up_threshold(struct kobject *a, struct attribute *b, return -EINVAL; } /* Calculate the new adj_up_threshold */ - od_tuners.adj_up_threshold += input; - od_tuners.adj_up_threshold -= od_tuners.up_threshold; + od_tuners->adj_up_threshold += input; + od_tuners->adj_up_threshold -= od_tuners->up_threshold; - od_tuners.up_threshold = input; + od_tuners->up_threshold = input; return count; } -static ssize_t store_sampling_down_factor(struct kobject *a, - struct attribute *b, const char *buf, size_t count) +static ssize_t store_sampling_down_factor(struct dbs_data *dbs_data, + const char *buf, size_t count) { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input, j; int ret; ret = sscanf(buf, "%u", &input); if (ret != 1 || input > MAX_SAMPLING_DOWN_FACTOR || input < 1) return -EINVAL; - od_tuners.sampling_down_factor = input; + od_tuners->sampling_down_factor = input; /* Reset down sampling multiplier in case it was active */ for_each_online_cpu(j) { @@ -386,9 +388,10 @@ static ssize_t store_sampling_down_factor(struct kobject *a, return count; } -static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_ignore_nice(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; @@ -401,10 +404,10 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, if (input > 1) input = 1; - if (input == od_tuners.ignore_nice) { /* nothing to do */ + if (input == od_tuners->ignore_nice) { /* nothing to do */ return count; } - od_tuners.ignore_nice = input; + od_tuners->ignore_nice = input; /* we need to re-evaluate prev_cpu_idle */ for_each_online_cpu(j) { @@ -412,7 +415,7 @@ static ssize_t store_ignore_nice_load(struct kobject *a, 
struct attribute *b, dbs_info = &per_cpu(od_cpu_dbs_info, j); dbs_info->cdbs.prev_cpu_idle = get_cpu_idle_time(j, &dbs_info->cdbs.prev_cpu_wall); - if (od_tuners.ignore_nice) + if (od_tuners->ignore_nice) dbs_info->cdbs.prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE]; @@ -420,9 +423,10 @@ static ssize_t store_ignore_nice_load(struct kobject *a, struct attribute *b, return count; } -static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b, - const char *buf, size_t count) +static ssize_t store_powersave_bias(struct dbs_data *dbs_data, const char *buf, + size_t count) { + struct od_dbs_tuners *od_tuners = dbs_data->tuners; unsigned int input; int ret; ret = sscanf(buf, "%u", &input); @@ -433,68 +437,138 @@ static ssize_t store_powersave_bias(struct kobject *a, struct attribute *b, if (input > 1000) input = 1000; - od_tuners.powersave_bias = input; + od_tuners->powersave_bias = input; ondemand_powersave_bias_init(); return count; } -show_one(od, sampling_rate, sampling_rate); -show_one(od, io_is_busy, io_is_busy); -show_one(od, up_threshold, up_threshold); -show_one(od, sampling_down_factor, sampling_down_factor); -show_one(od, ignore_nice_load, ignore_nice); -show_one(od, powersave_bias, powersave_bias); - -define_one_global_rw(sampling_rate); -define_one_global_rw(io_is_busy); -define_one_global_rw(up_threshold); -define_one_global_rw(sampling_down_factor); -define_one_global_rw(ignore_nice_load); -define_one_global_rw(powersave_bias); -define_one_global_ro(sampling_rate_min); - -static struct attribute *dbs_attributes[] = { - &sampling_rate_min.attr, - &sampling_rate.attr, - &up_threshold.attr, - &sampling_down_factor.attr, - &ignore_nice_load.attr, - &powersave_bias.attr, - &io_is_busy.attr, +show_store_one(od, sampling_rate); +show_store_one(od, io_is_busy); +show_store_one(od, up_threshold); +show_store_one(od, sampling_down_factor); +show_store_one(od, ignore_nice); +show_store_one(od, powersave_bias); +declare_show_sampling_rate_min(od); + +gov_sys_pol_attr_rw(sampling_rate); +gov_sys_pol_attr_rw(io_is_busy); +gov_sys_pol_attr_rw(up_threshold); +gov_sys_pol_attr_rw(sampling_down_factor); +gov_sys_pol_attr_rw(ignore_nice); +gov_sys_pol_attr_rw(powersave_bias); +gov_sys_pol_attr_ro(sampling_rate_min); + +static struct attribute *dbs_attributes_gov_sys[] = { + &sampling_rate_min_gov_sys.attr, + &sampling_rate_gov_sys.attr, + &up_threshold_gov_sys.attr, + &sampling_down_factor_gov_sys.attr, + &ignore_nice_gov_sys.attr, + &powersave_bias_gov_sys.attr, + &io_is_busy_gov_sys.attr, NULL }; -static struct attribute_group od_attr_group = { - .attrs = dbs_attributes, +static struct attribute_group od_attr_group_gov_sys = { + .attrs = dbs_attributes_gov_sys, + .name = "ondemand", +}; + +static struct attribute *dbs_attributes_gov_pol[] = { + &sampling_rate_min_gov_pol.attr, + &sampling_rate_gov_pol.attr, + &up_threshold_gov_pol.attr, + &sampling_down_factor_gov_pol.attr, + &ignore_nice_gov_pol.attr, + &powersave_bias_gov_pol.attr, + &io_is_busy_gov_pol.attr, + NULL +}; + +static struct attribute_group od_attr_group_gov_pol = { + .attrs = dbs_attributes_gov_pol, .name = "ondemand", }; /************************** sysfs end ************************/ +static int od_init(struct dbs_data *dbs_data) +{ + struct od_dbs_tuners *tuners; + u64 idle_time; + int cpu; + + tuners = kzalloc(sizeof(struct od_dbs_tuners), GFP_KERNEL); + if (!tuners) { + pr_err("%s: kzalloc failed\n", __func__); + return -ENOMEM; + } + + cpu = get_cpu(); + idle_time = get_cpu_idle_time_us(cpu, NULL); + 
put_cpu(); + if (idle_time != -1ULL) { + /* Idle micro accounting is supported. Use finer thresholds */ + tuners->up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; + tuners->adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD - + MICRO_FREQUENCY_DOWN_DIFFERENTIAL; + /* + * In nohz/micro accounting case we set the minimum frequency + * not depending on HZ, but fixed (very low). The deferred + * timer might skip some samples if idle/sleeping as needed. + */ + dbs_data->min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; + } else { + tuners->up_threshold = DEF_FREQUENCY_UP_THRESHOLD; + tuners->adj_up_threshold = DEF_FREQUENCY_UP_THRESHOLD - + DEF_FREQUENCY_DOWN_DIFFERENTIAL; + + /* For correct statistics, we need 10 ticks for each measure */ + dbs_data->min_sampling_rate = MIN_SAMPLING_RATE_RATIO * + jiffies_to_usecs(10); + } + + tuners->sampling_down_factor = DEF_SAMPLING_DOWN_FACTOR; + tuners->ignore_nice = 0; + tuners->powersave_bias = 0; + tuners->io_is_busy = should_io_be_busy(); + + dbs_data->tuners = tuners; + pr_info("%s: tuners %p\n", __func__, tuners); + mutex_init(&dbs_data->mutex); + return 0; +} + +static void od_exit(struct dbs_data *dbs_data) +{ + kfree(dbs_data->tuners); +} + define_get_cpu_dbs_routines(od_cpu_dbs_info); static struct od_ops od_ops = { - .io_busy = should_io_be_busy, .powersave_bias_init_cpu = ondemand_powersave_bias_init_cpu, .powersave_bias_target = powersave_bias_target, .freq_increase = dbs_freq_increase, }; -static struct dbs_data od_dbs_data = { +static struct common_dbs_data od_dbs_cdata = { .governor = GOV_ONDEMAND, - .attr_group = &od_attr_group, - .tuners = &od_tuners, + .attr_group_gov_sys = &od_attr_group_gov_sys, + .attr_group_gov_pol = &od_attr_group_gov_pol, .get_cpu_cdbs = get_cpu_cdbs, .get_cpu_dbs_info_s = get_cpu_dbs_info_s, .gov_dbs_timer = od_dbs_timer, .gov_check_cpu = od_check_cpu, .gov_ops = &od_ops, + .init = od_init, + .exit = od_exit, }; static int od_cpufreq_governor_dbs(struct cpufreq_policy *policy, unsigned int event) { - return cpufreq_governor_dbs(&od_dbs_data, policy, event); + return cpufreq_governor_dbs(policy, &od_dbs_cdata, event); } #ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND @@ -509,29 +583,6 @@ struct cpufreq_governor cpufreq_gov_ondemand = { static int __init cpufreq_gov_dbs_init(void) { - u64 idle_time; - int cpu = get_cpu(); - - mutex_init(&od_dbs_data.mutex); - idle_time = get_cpu_idle_time_us(cpu, NULL); - put_cpu(); - if (idle_time != -1ULL) { - /* Idle micro accounting is supported. Use finer thresholds */ - od_tuners.up_threshold = MICRO_FREQUENCY_UP_THRESHOLD; - od_tuners.adj_up_threshold = MICRO_FREQUENCY_UP_THRESHOLD - - MICRO_FREQUENCY_DOWN_DIFFERENTIAL; - /* - * In nohz/micro accounting case we set the minimum frequency - * not depending on HZ, but fixed (very low). The deferred - * timer might skip some samples if idle/sleeping as needed. - */ - od_dbs_data.min_sampling_rate = MICRO_FREQUENCY_MIN_SAMPLE_RATE; - } else { - /* For correct statistics, we need 10 ticks for each measure */ - od_dbs_data.min_sampling_rate = MIN_SAMPLING_RATE_RATIO * - jiffies_to_usecs(10); - } - return cpufreq_register_governor(&cpufreq_gov_ondemand); } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b7393b56f552..4bbc572dd521 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -232,6 +232,13 @@ struct cpufreq_driver { struct module *owner; char name[CPUFREQ_NAME_LEN]; u8 flags; + /* + * This should be set by platforms having multiple clock-domains, i.e. + * supporting multiple policies. 
With this sysfs directories of governor + * would be created in cpu/cpu/cpufreq/ directory and so they can + * use the same governor with different tunables for different clusters. + */ + bool have_governor_per_policy; /* needed by all drivers */ int (*init) (struct cpufreq_policy *policy); @@ -332,6 +339,7 @@ const char *cpufreq_get_current_driver(void); *********************************************************************/ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); int cpufreq_update_policy(unsigned int cpu); +bool have_governor_per_policy(void); #ifdef CONFIG_CPU_FREQ /* query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it */ -- cgit From 6a8e95b071ecf7357d294782b6ef4e707ce0fbbd Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Mon, 25 Mar 2013 21:34:51 +0800 Subject: ARM: mxs: move icoll driver into drivers/irqchip Move icoll.c into drivers/irqchip as irq-mxs.c, and along with the renaming, change the driver to use IRQCHIP_DECLARE. Signed-off-by: Shawn Guo --- arch/arm/mach-mxs/Makefile | 2 +- arch/arm/mach-mxs/icoll.c | 128 -------------------------------- arch/arm/mach-mxs/include/mach/common.h | 3 - arch/arm/mach-mxs/mach-mxs.c | 6 +- drivers/irqchip/Makefile | 1 + drivers/irqchip/irq-mxs.c | 121 ++++++++++++++++++++++++++++++ include/linux/irqchip/mxs.h | 14 ++++ 7 files changed, 141 insertions(+), 134 deletions(-) delete mode 100644 arch/arm/mach-mxs/icoll.c create mode 100644 drivers/irqchip/irq-mxs.c create mode 100644 include/linux/irqchip/mxs.h (limited to 'include/linux') diff --git a/arch/arm/mach-mxs/Makefile b/arch/arm/mach-mxs/Makefile index 76c336e6f5b5..b934603e2765 100644 --- a/arch/arm/mach-mxs/Makefile +++ b/arch/arm/mach-mxs/Makefile @@ -1,5 +1,5 @@ # Common support -obj-y := icoll.o ocotp.o system.o mm.o +obj-y := ocotp.o system.o mm.o obj-$(CONFIG_PM) += pm.o diff --git a/arch/arm/mach-mxs/icoll.c b/arch/arm/mach-mxs/icoll.c deleted file mode 100644 index b4d620765cf1..000000000000 --- a/arch/arm/mach-mxs/icoll.c +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Copyright (C) 2009-2010 Freescale Semiconductor, Inc. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define HW_ICOLL_VECTOR 0x0000 -#define HW_ICOLL_LEVELACK 0x0010 -#define HW_ICOLL_CTRL 0x0020 -#define HW_ICOLL_STAT_OFFSET 0x0070 -#define HW_ICOLL_INTERRUPTn_SET(n) (0x0124 + (n) * 0x10) -#define HW_ICOLL_INTERRUPTn_CLR(n) (0x0128 + (n) * 0x10) -#define BM_ICOLL_INTERRUPTn_ENABLE 0x00000004 -#define BV_ICOLL_LEVELACK_IRQLEVELACK__LEVEL0 0x1 - -#define ICOLL_NUM_IRQS 128 - -static void __iomem *icoll_base; -static struct irq_domain *icoll_domain; - -static void icoll_ack_irq(struct irq_data *d) -{ - /* - * The Interrupt Collector is able to prioritize irqs. 
- * Currently only level 0 is used. So acking can use - * BV_ICOLL_LEVELACK_IRQLEVELACK__LEVEL0 unconditionally. - */ - __raw_writel(BV_ICOLL_LEVELACK_IRQLEVELACK__LEVEL0, - icoll_base + HW_ICOLL_LEVELACK); -} - -static void icoll_mask_irq(struct irq_data *d) -{ - __raw_writel(BM_ICOLL_INTERRUPTn_ENABLE, - icoll_base + HW_ICOLL_INTERRUPTn_CLR(d->hwirq)); -} - -static void icoll_unmask_irq(struct irq_data *d) -{ - __raw_writel(BM_ICOLL_INTERRUPTn_ENABLE, - icoll_base + HW_ICOLL_INTERRUPTn_SET(d->hwirq)); -} - -static struct irq_chip mxs_icoll_chip = { - .irq_ack = icoll_ack_irq, - .irq_mask = icoll_mask_irq, - .irq_unmask = icoll_unmask_irq, -}; - -asmlinkage void __exception_irq_entry icoll_handle_irq(struct pt_regs *regs) -{ - u32 irqnr; - - do { - irqnr = __raw_readl(icoll_base + HW_ICOLL_STAT_OFFSET); - if (irqnr != 0x7f) { - __raw_writel(irqnr, icoll_base + HW_ICOLL_VECTOR); - irqnr = irq_find_mapping(icoll_domain, irqnr); - handle_IRQ(irqnr, regs); - continue; - } - break; - } while (1); -} - -static int icoll_irq_domain_map(struct irq_domain *d, unsigned int virq, - irq_hw_number_t hw) -{ - irq_set_chip_and_handler(virq, &mxs_icoll_chip, handle_level_irq); - set_irq_flags(virq, IRQF_VALID); - - return 0; -} - -static struct irq_domain_ops icoll_irq_domain_ops = { - .map = icoll_irq_domain_map, - .xlate = irq_domain_xlate_onecell, -}; - -static void __init icoll_of_init(struct device_node *np, - struct device_node *interrupt_parent) -{ - icoll_base = of_iomap(np, 0); - WARN_ON(!icoll_base); - - /* - * Interrupt Collector reset, which initializes the priority - * for each irq to level 0. - */ - stmp_reset_block(icoll_base + HW_ICOLL_CTRL); - - icoll_domain = irq_domain_add_linear(np, ICOLL_NUM_IRQS, - &icoll_irq_domain_ops, NULL); - WARN_ON(!icoll_domain); -} - -static const struct of_device_id icoll_of_match[] __initconst = { - {.compatible = "fsl,icoll", .data = icoll_of_init}, - { /* sentinel */ } -}; - -void __init icoll_init_irq(void) -{ - of_irq_init(icoll_of_match); -} diff --git a/arch/arm/mach-mxs/include/mach/common.h b/arch/arm/mach-mxs/include/mach/common.h index e043c4735b5a..df2a4ef14dae 100644 --- a/arch/arm/mach-mxs/include/mach/common.h +++ b/arch/arm/mach-mxs/include/mach/common.h @@ -22,7 +22,4 @@ extern void mx23_map_io(void); extern int mx28_clocks_init(void); extern void mx28_map_io(void); -extern void icoll_init_irq(void); -extern void icoll_handle_irq(struct pt_regs *); - #endif /* __MACH_MXS_COMMON_H__ */ diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c index c1c0fb414c28..10506381b446 100644 --- a/arch/arm/mach-mxs/mach-mxs.c +++ b/arch/arm/mach-mxs/mach-mxs.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -469,7 +471,7 @@ static const char *imx28_dt_compat[] __initdata = { DT_MACHINE_START(IMX23, "Freescale i.MX23 (Device Tree)") .map_io = mx23_map_io, - .init_irq = icoll_init_irq, + .init_irq = irqchip_init, .handle_irq = icoll_handle_irq, .init_time = imx23_timer_init, .init_machine = mxs_machine_init, @@ -479,7 +481,7 @@ MACHINE_END DT_MACHINE_START(IMX28, "Freescale i.MX28 (Device Tree)") .map_io = mx28_map_io, - .init_irq = icoll_init_irq, + .init_irq = irqchip_init, .handle_irq = icoll_handle_irq, .init_time = imx28_timer_init, .init_machine = mxs_machine_init, diff --git a/drivers/irqchip/Makefile b/drivers/irqchip/Makefile index 98e3b87bdf1b..9d8f4f1c6e39 100644 --- a/drivers/irqchip/Makefile +++ b/drivers/irqchip/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_IRQCHIP) += irqchip.o 
obj-$(CONFIG_ARCH_BCM2835) += irq-bcm2835.o obj-$(CONFIG_ARCH_EXYNOS) += exynos-combiner.o +obj-$(CONFIG_ARCH_MXS) += irq-mxs.o obj-$(CONFIG_METAG) += irq-metag-ext.o obj-$(CONFIG_METAG_PERFCOUNTER_IRQS) += irq-metag.o obj-$(CONFIG_ARCH_SUNXI) += irq-sunxi.o diff --git a/drivers/irqchip/irq-mxs.c b/drivers/irqchip/irq-mxs.c new file mode 100644 index 000000000000..29889bbdcc6d --- /dev/null +++ b/drivers/irqchip/irq-mxs.c @@ -0,0 +1,121 @@ +/* + * Copyright (C) 2009-2010 Freescale Semiconductor, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "irqchip.h" + +#define HW_ICOLL_VECTOR 0x0000 +#define HW_ICOLL_LEVELACK 0x0010 +#define HW_ICOLL_CTRL 0x0020 +#define HW_ICOLL_STAT_OFFSET 0x0070 +#define HW_ICOLL_INTERRUPTn_SET(n) (0x0124 + (n) * 0x10) +#define HW_ICOLL_INTERRUPTn_CLR(n) (0x0128 + (n) * 0x10) +#define BM_ICOLL_INTERRUPTn_ENABLE 0x00000004 +#define BV_ICOLL_LEVELACK_IRQLEVELACK__LEVEL0 0x1 + +#define ICOLL_NUM_IRQS 128 + +static void __iomem *icoll_base; +static struct irq_domain *icoll_domain; + +static void icoll_ack_irq(struct irq_data *d) +{ + /* + * The Interrupt Collector is able to prioritize irqs. + * Currently only level 0 is used. So acking can use + * BV_ICOLL_LEVELACK_IRQLEVELACK__LEVEL0 unconditionally. 
+ */
+ __raw_writel(BV_ICOLL_LEVELACK_IRQLEVELACK__LEVEL0,
+ icoll_base + HW_ICOLL_LEVELACK);
+}
+
+static void icoll_mask_irq(struct irq_data *d)
+{
+ __raw_writel(BM_ICOLL_INTERRUPTn_ENABLE,
+ icoll_base + HW_ICOLL_INTERRUPTn_CLR(d->hwirq));
+}
+
+static void icoll_unmask_irq(struct irq_data *d)
+{
+ __raw_writel(BM_ICOLL_INTERRUPTn_ENABLE,
+ icoll_base + HW_ICOLL_INTERRUPTn_SET(d->hwirq));
+}
+
+static struct irq_chip mxs_icoll_chip = {
+ .irq_ack = icoll_ack_irq,
+ .irq_mask = icoll_mask_irq,
+ .irq_unmask = icoll_unmask_irq,
+};
+
+asmlinkage void __exception_irq_entry icoll_handle_irq(struct pt_regs *regs)
+{
+ u32 irqnr;
+
+ do {
+ irqnr = __raw_readl(icoll_base + HW_ICOLL_STAT_OFFSET);
+ if (irqnr != 0x7f) {
+ __raw_writel(irqnr, icoll_base + HW_ICOLL_VECTOR);
+ irqnr = irq_find_mapping(icoll_domain, irqnr);
+ handle_IRQ(irqnr, regs);
+ continue;
+ }
+ break;
+ } while (1);
+}
+
+static int icoll_irq_domain_map(struct irq_domain *d, unsigned int virq,
+ irq_hw_number_t hw)
+{
+ irq_set_chip_and_handler(virq, &mxs_icoll_chip, handle_level_irq);
+ set_irq_flags(virq, IRQF_VALID);
+
+ return 0;
+}
+
+static struct irq_domain_ops icoll_irq_domain_ops = {
+ .map = icoll_irq_domain_map,
+ .xlate = irq_domain_xlate_onecell,
+};
+
+static void __init icoll_of_init(struct device_node *np,
+ struct device_node *interrupt_parent)
+{
+ icoll_base = of_iomap(np, 0);
+ WARN_ON(!icoll_base);
+
+ /*
+ * Interrupt Collector reset, which initializes the priority
+ * for each irq to level 0.
+ */
+ stmp_reset_block(icoll_base + HW_ICOLL_CTRL);
+
+ icoll_domain = irq_domain_add_linear(np, ICOLL_NUM_IRQS,
+ &icoll_irq_domain_ops, NULL);
+ WARN_ON(!icoll_domain);
+}
+IRQCHIP_DECLARE(mxs, "fsl,icoll", icoll_of_init);
diff --git a/include/linux/irqchip/mxs.h b/include/linux/irqchip/mxs.h
new file mode 100644
index 000000000000..9039a538a919
--- /dev/null
+++ b/include/linux/irqchip/mxs.h
@@ -0,0 +1,14 @@
+/*
+ * Copyright (C) 2013 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __LINUX_IRQCHIP_MXS_H
+#define __LINUX_IRQCHIP_MXS_H
+
+extern void icoll_handle_irq(struct pt_regs *);
+
+#endif
-- cgit
From 547f384f33dbd6171607f925ab246e25e315961e Mon Sep 17 00:00:00 2001
From: Lee Jones
Date: Thu, 28 Mar 2013 16:11:14 +0000
Subject: regulator: ab8500: add support for ab8505

To obtain full AB8505 regulator support, the AB8500 regulator driver first needs to know the AB8505's register layout and the initialisation value for each register. That information is provided here via a couple of large data structures.
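As orientation for the tables that follow: each entry names a register, the bits a board may program, and the value for those bits. Below is a minimal stand-alone sketch of that masked update, assuming only the fields of struct ab8500_regulator_reg_init; the helper name is illustrative, not part of the driver.

#include <stdint.h>

/* One entry of a register-init table: 'mask' limits which bits of the
 * register may be programmed, 'value' supplies those bits. */
struct reg_init_entry {
	int id;		/* index into the chip's register enum */
	uint8_t mask;	/* bits a board is allowed to touch */
	uint8_t value;	/* value for the bits under 'mask' */
};

/* Merge an entry into a register's current contents: bits outside the
 * mask are preserved, bits inside are forced to 'value'.  This is the
 * same update ab8500_modify_reg_init() performs in the board file
 * below, where BUG_ON(mask & ~reg_init[i].mask) additionally keeps a
 * board tweak from reaching outside the declared writable bits. */
static uint8_t apply_reg_init(const struct reg_init_entry *e, uint8_t cur)
{
	return (cur & ~e->mask) | (e->value & e->mask);
}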
Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500-regulators.c | 511 ++++++++++++++++++++- arch/arm/mach-ux500/board-mop500-regulators.h | 1 + drivers/regulator/ab8500.c | 617 ++++++++++++++++++++++++++ include/linux/regulator/ab8500.h | 75 +++- 4 files changed, 1195 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500-regulators.c b/arch/arm/mach-ux500/board-mop500-regulators.c index c1173a161a04..816151903d46 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.c +++ b/arch/arm/mach-ux500/board-mop500-regulators.c @@ -5,6 +5,7 @@ * * Authors: Sundar Iyer * Bengt Jonsson + * Daniel Willerud * * MOP500 board specific initialization for regulators */ @@ -99,6 +100,27 @@ static struct regulator_consumer_supply ab8500_vaux3_consumers[] = { REGULATOR_SUPPLY("vmmc", "sdi0"), }; +static struct regulator_consumer_supply ab8505_vaux4_consumers[] = { +}; + +static struct regulator_consumer_supply ab8505_vaux5_consumers[] = { +}; + +static struct regulator_consumer_supply ab8505_vaux6_consumers[] = { +}; + +static struct regulator_consumer_supply ab8505_vaux8_consumers[] = { + /* AB8500 audio codec device */ + REGULATOR_SUPPLY("v-aux8", NULL), +}; + +static struct regulator_consumer_supply ab8505_vadc_consumers[] = { + /* Internal general-purpose ADC */ + REGULATOR_SUPPLY("vddadc", "ab8500-gpadc.0"), + /* ADC for charger */ + REGULATOR_SUPPLY("vddadc", "ab8500-charger.0"), +}; + static struct regulator_consumer_supply ab8500_vtvout_consumers[] = { /* TV-out DENC supply */ REGULATOR_SUPPLY("vtvout", "ab8500-denc.0"), @@ -133,6 +155,11 @@ static struct regulator_consumer_supply ab8500_vintcore_consumers[] = { REGULATOR_SUPPLY("vddulpivio18", "ab8500-usb.0"), }; +static struct regulator_consumer_supply ab8505_usb_consumers[] = { + /* HS USB OTG physical interface */ + REGULATOR_SUPPLY("v-ape", NULL), +}; + static struct regulator_consumer_supply ab8500_vana_consumers[] = { /* External displays, connector on board, 1v8 power supply */ REGULATOR_SUPPLY("vsmps2", "mcde.0"), @@ -469,6 +496,450 @@ static struct regulator_init_data ab8500_ext_regulators[] = { }, }; +/* ab8505 regulator register initialization */ +static struct ab8500_regulator_reg_init ab8505_reg_init[] = { + /* + * VarmRequestCtrl + * VsmpsCRequestCtrl + * VsmpsARequestCtrl + * VsmpsBRequestCtrl + */ + INIT_REGULATOR_REGISTER(AB8505_REGUREQUESTCTRL1, 0x00, 0x00), + /* + * VsafeRequestCtrl + * VpllRequestCtrl + * VanaRequestCtrl = HP/LP depending on VxRequest + */ + INIT_REGULATOR_REGISTER(AB8505_REGUREQUESTCTRL2, 0x30, 0x00), + /* + * Vaux1RequestCtrl = HP/LP depending on VxRequest + * Vaux2RequestCtrl = HP/LP depending on VxRequest + */ + INIT_REGULATOR_REGISTER(AB8505_REGUREQUESTCTRL3, 0xf0, 0x00), + /* + * Vaux3RequestCtrl = HP/LP depending on VxRequest + * SwHPReq = Control through SWValid disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUREQUESTCTRL4, 0x07, 0x00), + /* + * VsmpsASysClkReq1HPValid + * VsmpsBSysClkReq1HPValid + * VsafeSysClkReq1HPValid + * VanaSysClkReq1HPValid = disabled + * VpllSysClkReq1HPValid + * Vaux1SysClkReq1HPValid = disabled + * Vaux2SysClkReq1HPValid = disabled + * Vaux3SysClkReq1HPValid = disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUSYSCLKREQ1HPVALID1, 0xe8, 0x00), + /* + * VsmpsCSysClkReq1HPValid + * VarmSysClkReq1HPValid + * VbbSysClkReq1HPValid + * VsmpsMSysClkReq1HPValid + */ + INIT_REGULATOR_REGISTER(AB8505_REGUSYSCLKREQ1HPVALID2, 0x00, 0x00), + /* + * VsmpsAHwHPReq1Valid + * VsmpsBHwHPReq1Valid + * 
VsafeHwHPReq1Valid + * VanaHwHPReq1Valid = disabled + * VpllHwHPReq1Valid + * Vaux1HwHPreq1Valid = disabled + * Vaux2HwHPReq1Valid = disabled + * Vaux3HwHPReqValid = disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUHWHPREQ1VALID1, 0xe8, 0x00), + /* + * VsmpsMHwHPReq1Valid + */ + INIT_REGULATOR_REGISTER(AB8505_REGUHWHPREQ1VALID2, 0x00, 0x00), + /* + * VsmpsAHwHPReq2Valid + * VsmpsBHwHPReq2Valid + * VsafeHwHPReq2Valid + * VanaHwHPReq2Valid = disabled + * VpllHwHPReq2Valid + * Vaux1HwHPReq2Valid = disabled + * Vaux2HwHPReq2Valid = disabled + * Vaux3HwHPReq2Valid = disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUHWHPREQ2VALID1, 0xe8, 0x00), + /* + * VsmpsMHwHPReq2Valid + */ + INIT_REGULATOR_REGISTER(AB8505_REGUHWHPREQ2VALID2, 0x00, 0x00), + /** + * VsmpsCSwHPReqValid + * VarmSwHPReqValid + * VsmpsASwHPReqValid + * VsmpsBSwHPReqValid + * VsafeSwHPReqValid + * VanaSwHPReqValid + * VanaSwHPReqValid = disabled + * VpllSwHPReqValid + * Vaux1SwHPReqValid = disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUSWHPREQVALID1, 0xa0, 0x00), + /* + * Vaux2SwHPReqValid = disabled + * Vaux3SwHPReqValid = disabled + * VsmpsMSwHPReqValid + */ + INIT_REGULATOR_REGISTER(AB8505_REGUSWHPREQVALID2, 0x03, 0x00), + /* + * SysClkReq2Valid1 = SysClkReq2 controlled + * SysClkReq3Valid1 = disabled + * SysClkReq4Valid1 = SysClkReq4 controlled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUSYSCLKREQVALID1, 0x0e, 0x0a), + /* + * SysClkReq2Valid2 = disabled + * SysClkReq3Valid2 = disabled + * SysClkReq4Valid2 = disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUSYSCLKREQVALID2, 0x0e, 0x00), + /* + * Vaux4SwHPReqValid + * Vaux4HwHPReq2Valid + * Vaux4HwHPReq1Valid + * Vaux4SysClkReq1HPValid + */ + INIT_REGULATOR_REGISTER(AB8505_REGUVAUX4REQVALID, 0x00, 0x00), + /* + * VadcEna = disabled + * VintCore12Ena = disabled + * VintCore12Sel = 1.25 V + * VintCore12LP = inactive (HP) + * VadcLP = inactive (HP) + */ + INIT_REGULATOR_REGISTER(AB8505_REGUMISC1, 0xfe, 0x10), + /* + * VaudioEna = disabled + * Vaux8Ena = disabled + * Vamic1Ena = disabled + * Vamic2Ena = disabled + */ + INIT_REGULATOR_REGISTER(AB8505_VAUDIOSUPPLY, 0x1e, 0x00), + /* + * Vamic1_dzout = high-Z when Vamic1 is disabled + * Vamic2_dzout = high-Z when Vamic2 is disabled + */ + INIT_REGULATOR_REGISTER(AB8505_REGUCTRL1VAMIC, 0x03, 0x00), + /* + * VsmpsARegu + * VsmpsASelCtrl + * VsmpsAAutoMode + * VsmpsAPWMMode + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSAREGU, 0x00, 0x00), + /* + * VsmpsBRegu + * VsmpsBSelCtrl + * VsmpsBAutoMode + * VsmpsBPWMMode + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSBREGU, 0x00, 0x00), + /* + * VsafeRegu + * VsafeSelCtrl + * VsafeAutoMode + * VsafePWMMode + */ + INIT_REGULATOR_REGISTER(AB8505_VSAFEREGU, 0x00, 0x00), + /* + * VPll = Hw controlled (NOTE! 
PRCMU bits) + * VanaRegu = force off + */ + INIT_REGULATOR_REGISTER(AB8505_VPLLVANAREGU, 0x0f, 0x02), + /* + * VextSupply1Regu = force OFF (OTP_ExtSupply12LPnPolarity 1) + * VextSupply2Regu = force OFF (OTP_ExtSupply12LPnPolarity 1) + * VextSupply3Regu = force OFF (OTP_ExtSupply3LPnPolarity 0) + * ExtSupply2Bypass = ExtSupply12LPn ball is 0 when Ena is 0 + * ExtSupply3Bypass = ExtSupply3LPn ball is 0 when Ena is 0 + */ + INIT_REGULATOR_REGISTER(AB8505_EXTSUPPLYREGU, 0xff, 0x30), + /* + * Vaux1Regu = force HP + * Vaux2Regu = force off + */ + INIT_REGULATOR_REGISTER(AB8505_VAUX12REGU, 0x0f, 0x01), + /* + * Vaux3Regu = force off + */ + INIT_REGULATOR_REGISTER(AB8505_VRF1VAUX3REGU, 0x03, 0x00), + /* + * VsmpsASel1 + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSASEL1, 0x00, 0x00), + /* + * VsmpsASel2 + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSASEL2, 0x00, 0x00), + /* + * VsmpsASel3 + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSASEL3, 0x00, 0x00), + /* + * VsmpsBSel1 + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSBSEL1, 0x00, 0x00), + /* + * VsmpsBSel2 + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSBSEL2, 0x00, 0x00), + /* + * VsmpsBSel3 + */ + INIT_REGULATOR_REGISTER(AB8505_VSMPSBSEL3, 0x00, 0x00), + /* + * VsafeSel1 + */ + INIT_REGULATOR_REGISTER(AB8505_VSAFESEL1, 0x00, 0x00), + /* + * VsafeSel2 + */ + INIT_REGULATOR_REGISTER(AB8505_VSAFESEL2, 0x00, 0x00), + /* + * VsafeSel3 + */ + INIT_REGULATOR_REGISTER(AB8505_VSAFESEL3, 0x00, 0x00), + /* + * Vaux1Sel = 2.8 V + */ + INIT_REGULATOR_REGISTER(AB8505_VAUX1SEL, 0x0f, 0x0C), + /* + * Vaux2Sel = 2.9 V + */ + INIT_REGULATOR_REGISTER(AB8505_VAUX2SEL, 0x0f, 0x0d), + /* + * Vaux3Sel = 2.91 V + */ + INIT_REGULATOR_REGISTER(AB8505_VRF1VAUX3SEL, 0x07, 0x07), + /* + * Vaux4RequestCtrl + */ + INIT_REGULATOR_REGISTER(AB8505_VAUX4REQCTRL, 0x00, 0x00), + /* + * Vaux4Regu + */ + INIT_REGULATOR_REGISTER(AB8505_VAUX4REGU, 0x00, 0x00), + /* + * Vaux4Sel + */ + INIT_REGULATOR_REGISTER(AB8505_VAUX4SEL, 0x00, 0x00), + /* + * Vaux1Disch = short discharge time + * Vaux2Disch = short discharge time + * Vaux3Disch = short discharge time + * Vintcore12Disch = short discharge time + * VTVoutDisch = short discharge time + * VaudioDisch = short discharge time + */ + INIT_REGULATOR_REGISTER(AB8505_REGUCTRLDISCH, 0xfc, 0x00), + /* + * VanaDisch = short discharge time + * Vaux8PullDownEna = pulldown disabled when Vaux8 is disabled + * Vaux8Disch = short discharge time + */ + INIT_REGULATOR_REGISTER(AB8505_REGUCTRLDISCH2, 0x16, 0x00), + /* + * Vaux4Disch = short discharge time + */ + INIT_REGULATOR_REGISTER(AB8505_REGUCTRLDISCH3, 0x01, 0x00), + /* + * Vaux5Sel + * Vaux5LP + * Vaux5Ena + * Vaux5Disch + * Vaux5DisSfst + * Vaux5DisPulld + */ + INIT_REGULATOR_REGISTER(AB8505_CTRLVAUX5, 0x00, 0x00), + /* + * Vaux6Sel + * Vaux6LP + * Vaux6Ena + * Vaux6DisPulld + */ + INIT_REGULATOR_REGISTER(AB8505_CTRLVAUX6, 0x00, 0x00), +}; + +struct regulator_init_data ab8505_regulators[AB8505_NUM_REGULATORS] = { + /* supplies to the display/camera */ + [AB8505_LDO_AUX1] = { + .constraints = { + .name = "V-DISPLAY", + .min_uV = 2800000, + .max_uV = 3300000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS, + .boot_on = 1, /* display is on at boot */ + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vaux1_consumers), + .consumer_supplies = ab8500_vaux1_consumers, + }, + /* supplies to the on-board eMMC */ + [AB8505_LDO_AUX2] = { + .constraints = { + .name = "V-eMMC1", + .min_uV = 1100000, + .max_uV = 3300000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS | + 
REGULATOR_CHANGE_MODE, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vaux2_consumers), + .consumer_supplies = ab8500_vaux2_consumers, + }, + /* supply for VAUX3, supplies to SDcard slots */ + [AB8505_LDO_AUX3] = { + .constraints = { + .name = "V-MMC-SD", + .min_uV = 1100000, + .max_uV = 3300000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS | + REGULATOR_CHANGE_MODE, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vaux3_consumers), + .consumer_supplies = ab8500_vaux3_consumers, + }, + /* supply for VAUX4, supplies to NFC and standalone secure element */ + [AB8505_LDO_AUX4] = { + .constraints = { + .name = "V-NFC-SE", + .min_uV = 1100000, + .max_uV = 3300000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS | + REGULATOR_CHANGE_MODE, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8505_vaux4_consumers), + .consumer_supplies = ab8505_vaux4_consumers, + }, + /* supply for VAUX5, supplies to TBD */ + [AB8505_LDO_AUX5] = { + .constraints = { + .name = "V-AUX5", + .min_uV = 1050000, + .max_uV = 2790000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS | + REGULATOR_CHANGE_MODE, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8505_vaux5_consumers), + .consumer_supplies = ab8505_vaux5_consumers, + }, + /* supply for VAUX6, supplies to TBD */ + [AB8505_LDO_AUX6] = { + .constraints = { + .name = "V-AUX6", + .min_uV = 1050000, + .max_uV = 2790000, + .valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS | + REGULATOR_CHANGE_MODE, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8505_vaux6_consumers), + .consumer_supplies = ab8505_vaux6_consumers, + }, + /* supply for gpadc, ADC LDO */ + [AB8505_LDO_ADC] = { + .constraints = { + .name = "V-ADC", + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8505_vadc_consumers), + .consumer_supplies = ab8505_vadc_consumers, + }, + /* supply for ab8500-vaudio, VAUDIO LDO */ + [AB8505_LDO_AUDIO] = { + .constraints = { + .name = "V-AUD", + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vaud_consumers), + .consumer_supplies = ab8500_vaud_consumers, + }, + /* supply for v-anamic1 VAMic1-LDO */ + [AB8505_LDO_ANAMIC1] = { + .constraints = { + .name = "V-AMIC1", + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vamic1_consumers), + .consumer_supplies = ab8500_vamic1_consumers, + }, + /* supply for v-amic2, VAMIC2 LDO, reuse constants for AMIC1 */ + [AB8505_LDO_ANAMIC2] = { + .constraints = { + .name = "V-AMIC2", + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vamic2_consumers), + .consumer_supplies = ab8500_vamic2_consumers, + }, + /* supply for v-aux8, VAUX8 LDO */ + [AB8505_LDO_AUX8] = { + .constraints = { + .name = "V-AUX8", + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8505_vaux8_consumers), + .consumer_supplies = ab8505_vaux8_consumers, + }, + /* supply for v-intcore12, VINTCORE12 LDO */ + [AB8505_LDO_INTCORE] = { + .constraints = { + .name = "V-INTCORE", + .min_uV = 1250000, + .max_uV = 1350000, + .input_uV = 1800000, + 
.valid_ops_mask = REGULATOR_CHANGE_VOLTAGE | + REGULATOR_CHANGE_STATUS | + REGULATOR_CHANGE_MODE | + REGULATOR_CHANGE_DRMS, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vintcore_consumers), + .consumer_supplies = ab8500_vintcore_consumers, + }, + /* supply for LDO USB */ + [AB8505_LDO_USB] = { + .constraints = { + .name = "V-USB", + .valid_ops_mask = REGULATOR_CHANGE_STATUS | + REGULATOR_CHANGE_MODE, + .valid_modes_mask = REGULATOR_MODE_NORMAL | + REGULATOR_MODE_IDLE, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8505_usb_consumers), + .consumer_supplies = ab8505_usb_consumers, + }, + /* supply for U8500 CSI-DSI, VANA LDO */ + [AB8505_LDO_ANA] = { + .constraints = { + .name = "V-CSI-DSI", + .valid_ops_mask = REGULATOR_CHANGE_STATUS, + }, + .num_consumer_supplies = ARRAY_SIZE(ab8500_vana_consumers), + .consumer_supplies = ab8500_vana_consumers, + }, +}; + struct ab8500_regulator_platform_data ab8500_regulator_plat_data = { .reg_init = ab8500_reg_init, .num_reg_init = ARRAY_SIZE(ab8500_reg_init), @@ -478,18 +949,39 @@ struct ab8500_regulator_platform_data ab8500_regulator_plat_data = { .num_ext_regulator = ARRAY_SIZE(ab8500_ext_regulators), }; +/* Use the AB8500 init settings for AB8505 as they are the same right now */ +struct ab8500_regulator_platform_data ab8505_regulator_plat_data = { + .reg_init = ab8505_reg_init, + .num_reg_init = ARRAY_SIZE(ab8505_reg_init), + .regulator = ab8505_regulators, + .num_regulator = ARRAY_SIZE(ab8505_regulators), +}; + static void ab8500_modify_reg_init(int id, u8 mask, u8 value) { int i; - for (i = ARRAY_SIZE(ab8500_reg_init) - 1; i >= 0; i--) { - if (ab8500_reg_init[i].id == id) { - u8 initval = ab8500_reg_init[i].value; - initval = (initval & ~mask) | (value & mask); - ab8500_reg_init[i].value = initval; + if (cpu_is_u8520()) { + for (i = ARRAY_SIZE(ab8505_reg_init) - 1; i >= 0; i--) { + if (ab8505_reg_init[i].id == id) { + u8 initval = ab8505_reg_init[i].value; + initval = (initval & ~mask) | (value & mask); + ab8505_reg_init[i].value = initval; - BUG_ON(mask & ~ab8500_reg_init[i].mask); - return; + BUG_ON(mask & ~ab8505_reg_init[i].mask); + return; + } + } + } else { + for (i = ARRAY_SIZE(ab8500_reg_init) - 1; i >= 0; i--) { + if (ab8500_reg_init[i].id == id) { + u8 initval = ab8500_reg_init[i].value; + initval = (initval & ~mask) | (value & mask); + ab8500_reg_init[i].value = initval; + + BUG_ON(mask & ~ab8500_reg_init[i].mask); + return; + } } } @@ -511,6 +1003,11 @@ void mop500_regulator_init(void) regulator->constraints.state_standby.disabled = 1; } + if (cpu_is_u8520()) { + /* Vaux2 initialized to be on */ + ab8500_modify_reg_init(AB8505_VAUX12REGU, 0x0f, 0x05); + } + /* * Handle AB8500_EXT_SUPPLY2 on HREFP_V20_V50 boards (do it for * all HREFP_V20 boards) diff --git a/arch/arm/mach-ux500/board-mop500-regulators.h b/arch/arm/mach-ux500/board-mop500-regulators.h index 3d4c412d0b7a..9bece38fe933 100644 --- a/arch/arm/mach-ux500/board-mop500-regulators.h +++ b/arch/arm/mach-ux500/board-mop500-regulators.h @@ -15,6 +15,7 @@ #include extern struct ab8500_regulator_platform_data ab8500_regulator_plat_data; +extern struct ab8500_regulator_platform_data ab8505_regulator_plat_data; extern struct regulator_init_data tps61052_regulator; extern struct regulator_init_data gpio_en_3v3_regulator; diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 9de3a211b0b4..1ab0f8a7c862 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -5,11 +5,15 @@ * * 
Authors: Sundar Iyer for ST-Ericsson * Bengt Jonsson for ST-Ericsson + * Daniel Willerud for ST-Ericsson * * AB8500 peripheral regulators * * AB8500 supports the following regulators: * VAUX1/2/3, VINTCORE, VTVOUT, VUSB, VAUDIO, VAMIC1/2, VDMIC, VANA + * + * AB8505 supports the following regulators: + * VAUX1/2/3/4/5/6, VINTCORE, VADC, VUSB, VAUDIO, VAMIC1/2, VDMIC, VANA */ #include #include @@ -92,6 +96,17 @@ static const unsigned int ldo_vaux3_voltages[] = { 2910000, }; +static const int ldo_vaux56_voltages[] = { + 1800000, + 1050000, + 1100000, + 1200000, + 1500000, + 2200000, + 2500000, + 2790000, +}; + static const unsigned int ldo_vintcore_voltages[] = { 1200000, 1225000, @@ -589,6 +604,313 @@ static struct ab8500_regulator_info }, }; +/* AB8505 regulator information */ +static struct ab8500_regulator_info + ab8505_regulator_info[AB8505_NUM_REGULATORS] = { + /* + * Variable Voltage Regulators + * name, min mV, max mV, + * update bank, reg, mask, enable val + * volt bank, reg, mask, table, table length + */ + [AB8505_LDO_AUX1] = { + .desc = { + .name = "LDO-AUX1", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUX1, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), + }, + .min_uV = 1100000, + .max_uV = 3300000, + .load_lp_uA = 5000, + .update_bank = 0x04, + .update_reg = 0x09, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + .voltage_bank = 0x04, + .voltage_reg = 0x1f, + .voltage_mask = 0x0f, + .voltages = ldo_vauxn_voltages, + .voltages_len = ARRAY_SIZE(ldo_vauxn_voltages), + }, + [AB8505_LDO_AUX2] = { + .desc = { + .name = "LDO-AUX2", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUX2, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), + }, + .min_uV = 1100000, + .max_uV = 3300000, + .load_lp_uA = 5000, + .update_bank = 0x04, + .update_reg = 0x09, + .update_mask = 0x0c, + .update_val = 0x04, + .update_val_idle = 0x0c, + .update_val_normal = 0x04, + .voltage_bank = 0x04, + .voltage_reg = 0x20, + .voltage_mask = 0x0f, + .voltages = ldo_vauxn_voltages, + .voltages_len = ARRAY_SIZE(ldo_vauxn_voltages), + }, + [AB8505_LDO_AUX3] = { + .desc = { + .name = "LDO-AUX3", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUX3, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vaux3_voltages), + }, + .min_uV = 1100000, + .max_uV = 3300000, + .load_lp_uA = 5000, + .update_bank = 0x04, + .update_reg = 0x0a, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + .voltage_bank = 0x04, + .voltage_reg = 0x21, + .voltage_mask = 0x07, + .voltages = ldo_vaux3_voltages, + .voltages_len = ARRAY_SIZE(ldo_vaux3_voltages), + }, + [AB8505_LDO_AUX4] = { + .desc = { + .name = "LDO-AUX4", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB9540_LDO_AUX4, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), + }, + .min_uV = 1100000, + .max_uV = 3300000, + .load_lp_uA = 5000, + /* values for Vaux4Regu register */ + .update_bank = 0x04, + .update_reg = 0x2e, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + /* values for Vaux4SEL register */ + .voltage_bank = 0x04, + .voltage_reg = 0x2f, + .voltage_mask = 0x0f, + .voltages = ldo_vauxn_voltages, + .voltages_len = ARRAY_SIZE(ldo_vauxn_voltages), + }, + [AB8505_LDO_AUX5] = { + .desc = { + .name = 
"LDO-AUX5", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8505_LDO_AUX5, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vaux56_voltages), + }, + .min_uV = 1050000, + .max_uV = 2790000, + .load_lp_uA = 2000, + /* values for CtrlVaux5 register */ + .update_bank = 0x01, + .update_reg = 0x55, + .update_mask = 0x08, + .update_val = 0x00, + .update_val_idle = 0x01, + .update_val_normal = 0x00, + .voltage_bank = 0x01, + .voltage_reg = 0x55, + .voltage_mask = 0x07, + .voltages = ldo_vaux56_voltages, + .voltages_len = ARRAY_SIZE(ldo_vaux56_voltages), + }, + [AB8505_LDO_AUX6] = { + .desc = { + .name = "LDO-AUX6", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8505_LDO_AUX6, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vaux56_voltages), + }, + .min_uV = 1050000, + .max_uV = 2790000, + .load_lp_uA = 2000, + /* values for CtrlVaux6 register */ + .update_bank = 0x01, + .update_reg = 0x56, + .update_mask = 0x08, + .update_val = 0x00, + .update_val_idle = 0x01, + .update_val_normal = 0x00, + .voltage_bank = 0x01, + .voltage_reg = 0x56, + .voltage_mask = 0x07, + .voltages = ldo_vaux56_voltages, + .voltages_len = ARRAY_SIZE(ldo_vaux56_voltages), + }, + [AB8505_LDO_INTCORE] = { + .desc = { + .name = "LDO-INTCORE", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_INTCORE, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vintcore_voltages), + }, + .min_uV = 1100000, + .max_uV = 3300000, + .load_lp_uA = 5000, + .update_bank = 0x03, + .update_reg = 0x80, + .update_mask = 0x44, + .update_val = 0x04, + .update_val_idle = 0x44, + .update_val_normal = 0x04, + .voltage_bank = 0x03, + .voltage_reg = 0x80, + .voltage_mask = 0x38, + .voltages = ldo_vintcore_voltages, + .voltages_len = ARRAY_SIZE(ldo_vintcore_voltages), + .voltage_shift = 3, + }, + + /* + * Fixed Voltage Regulators + * name, fixed mV, + * update bank, reg, mask, enable val + */ + [AB8505_LDO_ADC] = { + .desc = { + .name = "LDO-ADC", + .ops = &ab8500_regulator_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8505_LDO_ADC, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .delay = 10000, + .fixed_uV = 2000000, + .load_lp_uA = 1000, + .update_bank = 0x03, + .update_reg = 0x80, + .update_mask = 0x82, + .update_val = 0x02, + .update_val_idle = 0x82, + .update_val_normal = 0x02, + }, + [AB8505_LDO_USB] = { + .desc = { + .name = "LDO-USB", + .ops = &ab8500_regulator_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB9540_LDO_USB, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 3300000, + .update_bank = 0x03, + .update_reg = 0x82, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + }, + [AB8505_LDO_AUDIO] = { + .desc = { + .name = "LDO-AUDIO", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUDIO, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 2000000, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x02, + .update_val = 0x02, + }, + [AB8505_LDO_ANAMIC1] = { + .desc = { + .name = "LDO-ANAMIC1", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_ANAMIC1, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 2050000, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x08, + .update_val = 0x08, + }, + [AB8505_LDO_ANAMIC2] = { + .desc = { + .name = "LDO-ANAMIC2", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_ANAMIC2, + .owner = 
THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 2050000, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x10, + .update_val = 0x10, + }, + [AB8505_LDO_AUX8] = { + .desc = { + .name = "LDO-AUX8", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8505_LDO_AUX8, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 1800000, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x04, + .update_val = 0x04, + }, + /* + * Regulators with fixed voltage and normal/idle modes + */ + [AB8505_LDO_ANA] = { + .desc = { + .name = "LDO-ANA", + .ops = &ab8500_regulator_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_ANA, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .fixed_uV = 1200000, + .load_lp_uA = 1000, + .update_bank = 0x04, + .update_reg = 0x06, + .update_mask = 0x0c, + .update_val = 0x04, + .update_val_idle = 0x0c, + .update_val_normal = 0x04, + }, +}; + /* AB9540 regulator information */ static struct ab8500_regulator_info ab9540_regulator_info[AB9540_NUM_REGULATORS] = { @@ -1031,6 +1353,276 @@ static struct ab8500_reg_init ab8500_reg_init[] = { REG_INIT(AB8500_REGUCTRLDISCH2, 0x04, 0x44, 0x16), }; +/* AB8505 register init */ +static struct ab8500_reg_init ab8505_reg_init[] = { + /* + * 0x03, VarmRequestCtrl + * 0x0c, VsmpsCRequestCtrl + * 0x30, VsmpsARequestCtrl + * 0xc0, VsmpsBRequestCtrl + */ + REG_INIT(AB8505_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), + /* + * 0x03, VsafeRequestCtrl + * 0x0c, VpllRequestCtrl + * 0x30, VanaRequestCtrl + */ + REG_INIT(AB8505_REGUREQUESTCTRL2, 0x03, 0x04, 0x3f), + /* + * 0x30, Vaux1RequestCtrl + * 0xc0, Vaux2RequestCtrl + */ + REG_INIT(AB8505_REGUREQUESTCTRL3, 0x03, 0x05, 0xf0), + /* + * 0x03, Vaux3RequestCtrl + * 0x04, SwHPReq + */ + REG_INIT(AB8505_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + /* + * 0x01, VsmpsASysClkReq1HPValid + * 0x02, VsmpsBSysClkReq1HPValid + * 0x04, VsafeSysClkReq1HPValid + * 0x08, VanaSysClkReq1HPValid + * 0x10, VpllSysClkReq1HPValid + * 0x20, Vaux1SysClkReq1HPValid + * 0x40, Vaux2SysClkReq1HPValid + * 0x80, Vaux3SysClkReq1HPValid + */ + REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + /* + * 0x01, VsmpsCSysClkReq1HPValid + * 0x02, VarmSysClkReq1HPValid + * 0x04, VbbSysClkReq1HPValid + * 0x08, VsmpsMSysClkReq1HPValid + */ + REG_INIT(AB8505_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x0f), + /* + * 0x01, VsmpsAHwHPReq1Valid + * 0x02, VsmpsBHwHPReq1Valid + * 0x04, VsafeHwHPReq1Valid + * 0x08, VanaHwHPReq1Valid + * 0x10, VpllHwHPReq1Valid + * 0x20, Vaux1HwHPReq1Valid + * 0x40, Vaux2HwHPReq1Valid + * 0x80, Vaux3HwHPReq1Valid + */ + REG_INIT(AB8505_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + /* + * 0x08, VsmpsMHwHPReq1Valid + */ + REG_INIT(AB8505_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x08), + /* + * 0x01, VsmpsAHwHPReq2Valid + * 0x02, VsmpsBHwHPReq2Valid + * 0x04, VsafeHwHPReq2Valid + * 0x08, VanaHwHPReq2Valid + * 0x10, VpllHwHPReq2Valid + * 0x20, Vaux1HwHPReq2Valid + * 0x40, Vaux2HwHPReq2Valid + * 0x80, Vaux3HwHPReq2Valid + */ + REG_INIT(AB8505_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + /* + * 0x08, VsmpsMHwHPReq2Valid + */ + REG_INIT(AB8505_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x08), + /* + * 0x01, VsmpsCSwHPReqValid + * 0x02, VarmSwHPReqValid + * 0x04, VsmpsASwHPReqValid + * 0x08, VsmpsBSwHPReqValid + * 0x10, VsafeSwHPReqValid + * 0x20, VanaSwHPReqValid + * 0x40, VpllSwHPReqValid + * 0x80, Vaux1SwHPReqValid + */ + REG_INIT(AB8505_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + /* + * 0x01, Vaux2SwHPReqValid + * 0x02, Vaux3SwHPReqValid + * 0x20, VsmpsMSwHPReqValid + */ + 
REG_INIT(AB8505_REGUSWHPREQVALID2, 0x03, 0x0e, 0x23), + /* + * 0x02, SysClkReq2Valid1 + * 0x04, SysClkReq3Valid1 + * 0x08, SysClkReq4Valid1 + */ + REG_INIT(AB8505_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0x0e), + /* + * 0x02, SysClkReq2Valid2 + * 0x04, SysClkReq3Valid2 + * 0x08, SysClkReq4Valid2 + */ + REG_INIT(AB8505_REGUSYSCLKREQVALID2, 0x03, 0x10, 0x0e), + /* + * 0x01, Vaux4SwHPReqValid + * 0x02, Vaux4HwHPReq2Valid + * 0x04, Vaux4HwHPReq1Valid + * 0x08, Vaux4SysClkReq1HPValid + */ + REG_INIT(AB8505_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), + /* + * 0x02, VadcEna + * 0x04, VintCore12Ena + * 0x38, VintCore12Sel + * 0x40, VintCore12LP + * 0x80, VadcLP + */ + REG_INIT(AB8505_REGUMISC1, 0x03, 0x80, 0xfe), + /* + * 0x02, VaudioEna + * 0x04, VdmicEna + * 0x08, Vamic1Ena + * 0x10, Vamic2Ena + */ + REG_INIT(AB8505_VAUDIOSUPPLY, 0x03, 0x83, 0x1e), + /* + * 0x01, Vamic1_dzout + * 0x02, Vamic2_dzout + */ + REG_INIT(AB8505_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), + /* + * 0x03, VsmpsARegu + * 0x0c, VsmpsASelCtrl + * 0x10, VsmpsAAutoMode + * 0x20, VsmpsAPWMMode + */ + REG_INIT(AB8505_VSMPSAREGU, 0x04, 0x03, 0x3f), + /* + * 0x03, VsmpsBRegu + * 0x0c, VsmpsBSelCtrl + * 0x10, VsmpsBAutoMode + * 0x20, VsmpsBPWMMode + */ + REG_INIT(AB8505_VSMPSBREGU, 0x04, 0x04, 0x3f), + /* + * 0x03, VsafeRegu + * 0x0c, VsafeSelCtrl + * 0x10, VsafeAutoMode + * 0x20, VsafePWMMode + */ + REG_INIT(AB8505_VSAFEREGU, 0x04, 0x05, 0x3f), + /* + * 0x03, VpllRegu (NOTE! PRCMU register bits) + * 0x0c, VanaRegu + */ + REG_INIT(AB8505_VPLLVANAREGU, 0x04, 0x06, 0x0f), + /* + * 0x03, VextSupply1Regu + * 0x0c, VextSupply2Regu + * 0x30, VextSupply3Regu + * 0x40, ExtSupply2Bypass + * 0x80, ExtSupply3Bypass + */ + REG_INIT(AB8505_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + /* + * 0x03, Vaux1Regu + * 0x0c, Vaux2Regu + */ + REG_INIT(AB8505_VAUX12REGU, 0x04, 0x09, 0x0f), + /* + * 0x0f, Vaux3Regu + */ + REG_INIT(AB8505_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), + /* + * 0x3f, VsmpsASel1 + */ + REG_INIT(AB8505_VSMPSASEL1, 0x04, 0x13, 0x3f), + /* + * 0x3f, VsmpsASel2 + */ + REG_INIT(AB8505_VSMPSASEL2, 0x04, 0x14, 0x3f), + /* + * 0x3f, VsmpsASel3 + */ + REG_INIT(AB8505_VSMPSASEL3, 0x04, 0x15, 0x3f), + /* + * 0x3f, VsmpsBSel1 + */ + REG_INIT(AB8505_VSMPSBSEL1, 0x04, 0x17, 0x3f), + /* + * 0x3f, VsmpsBSel2 + */ + REG_INIT(AB8505_VSMPSBSEL2, 0x04, 0x18, 0x3f), + /* + * 0x3f, VsmpsBSel3 + */ + REG_INIT(AB8505_VSMPSBSEL3, 0x04, 0x19, 0x3f), + /* + * 0x7f, VsafeSel1 + */ + REG_INIT(AB8505_VSAFESEL1, 0x04, 0x1b, 0x7f), + /* + * 0x3f, VsafeSel2 + */ + REG_INIT(AB8505_VSAFESEL2, 0x04, 0x1c, 0x7f), + /* + * 0x3f, VsafeSel3 + */ + REG_INIT(AB8505_VSAFESEL3, 0x04, 0x1d, 0x7f), + /* + * 0x0f, Vaux1Sel + */ + REG_INIT(AB8505_VAUX1SEL, 0x04, 0x1f, 0x0f), + /* + * 0x0f, Vaux2Sel + */ + REG_INIT(AB8505_VAUX2SEL, 0x04, 0x20, 0x0f), + /* + * 0x07, Vaux3Sel + * 0x30, VRF1Sel + */ + REG_INIT(AB8505_VRF1VAUX3SEL, 0x04, 0x21, 0x37), + /* + * 0x03, Vaux4RequestCtrl + */ + REG_INIT(AB8505_VAUX4REQCTRL, 0x04, 0x2d, 0x03), + /* + * 0x03, Vaux4Regu + */ + REG_INIT(AB8505_VAUX4REGU, 0x04, 0x2e, 0x03), + /* + * 0x0f, Vaux4Sel + */ + REG_INIT(AB8505_VAUX4SEL, 0x04, 0x2f, 0x0f), + /* + * 0x04, Vaux1Disch + * 0x08, Vaux2Disch + * 0x10, Vaux3Disch + * 0x20, Vintcore12Disch + * 0x40, VTVoutDisch + * 0x80, VaudioDisch + */ + REG_INIT(AB8505_REGUCTRLDISCH, 0x04, 0x43, 0xfc), + /* + * 0x02, VanaDisch + * 0x04, VdmicPullDownEna + * 0x10, VdmicDisch + */ + REG_INIT(AB8505_REGUCTRLDISCH2, 0x04, 0x44, 0x16), + /* + * 0x01, Vaux4Disch + */ + REG_INIT(AB8505_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + /* + * 0x07, Vaux5Sel + * 
0x08, Vaux5LP
+ * 0x10, Vaux5Ena
+ * 0x20, Vaux5Disch
+ * 0x40, Vaux5DisSfst
+ * 0x80, Vaux5DisPulld
+ */
+ REG_INIT(AB8505_CTRLVAUX5, 0x01, 0x55, 0xff),
+ /*
+ * 0x07, Vaux6Sel
+ * 0x08, Vaux6LP
+ * 0x10, Vaux6Ena
+ * 0x80, Vaux6DisPulld
+ */
+ REG_INIT(AB8505_CTRLVAUX6, 0x01, 0x56, 0x9f),
+};
+
/* AB9540 register init */
static struct ab8500_reg_init ab9540_reg_init[] = {
/*
@@ -1396,6 +1988,22 @@ static struct of_regulator_match ab8500_regulator_match[] = {
{ .name = "ab8500_ldo_ana", .driver_data = (void *) AB8500_LDO_ANA, },
};

+static struct of_regulator_match ab8505_regulator_match[] = {
+ { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB8505_LDO_AUX1, },
+ { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB8505_LDO_AUX2, },
+ { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB8505_LDO_AUX3, },
+ { .name = "ab8500_ldo_aux4", .driver_data = (void *) AB8505_LDO_AUX4, },
+ { .name = "ab8500_ldo_aux5", .driver_data = (void *) AB8505_LDO_AUX5, },
+ { .name = "ab8500_ldo_aux6", .driver_data = (void *) AB8505_LDO_AUX6, },
+ { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB8505_LDO_INTCORE, },
+ { .name = "ab8500_ldo_adc", .driver_data = (void *) AB8505_LDO_ADC, },
+ { .name = "ab8500_ldo_audio", .driver_data = (void *) AB8505_LDO_AUDIO, },
+ { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB8505_LDO_ANAMIC1, },
+ { .name = "ab8500_ldo_anamic2", .driver_data = (void *) AB8505_LDO_ANAMIC2, },
+ { .name = "ab8500_ldo_aux8", .driver_data = (void *) AB8505_LDO_AUX8, },
+ { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8505_LDO_ANA, },
+};
+
static struct of_regulator_match ab9540_regulator_match[] = {
{ .name = "ab8500_ldo_aux1", .driver_data = (void *) AB9540_LDO_AUX1, },
{ .name = "ab8500_ldo_aux2", .driver_data = (void *) AB9540_LDO_AUX2, },
@@ -1450,6 +2058,11 @@ static int ab8500_regulator_probe(struct platform_device *pdev)
reg_init_size = AB9540_NUM_REGULATOR_REGISTERS;
match = ab9540_regulator_match;
match_size = ARRAY_SIZE(ab9540_regulator_match);
+ } else if (is_ab8505(ab8500)) {
+ regulator_info = ab8505_regulator_info;
+ regulator_info_size = ARRAY_SIZE(ab8505_regulator_info);
+ reg_init = ab8505_reg_init;
+ reg_init_size = AB8505_NUM_REGULATOR_REGISTERS;
} else {
regulator_info = ab8500_regulator_info;
regulator_info_size = ARRAY_SIZE(ab8500_regulator_info);
@@ -1543,6 +2156,9 @@ static int ab8500_regulator_remove(struct platform_device *pdev)
if (is_ab9540(ab8500)) {
regulator_info = ab9540_regulator_info;
regulator_info_size = ARRAY_SIZE(ab9540_regulator_info);
+ } else if (is_ab8505(ab8500)) {
+ regulator_info = ab8505_regulator_info;
+ regulator_info_size = ARRAY_SIZE(ab8505_regulator_info);
} else {
regulator_info = ab8500_regulator_info;
regulator_info_size = ARRAY_SIZE(ab8500_regulator_info);
@@ -1601,5 +2217,6 @@ module_exit(ab8500_regulator_exit);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Sundar Iyer ");
MODULE_AUTHOR("Bengt Jonsson ");
+MODULE_AUTHOR("Daniel Willerud ");
MODULE_DESCRIPTION("Regulator Driver for ST-Ericsson AB8500 Mixed-Sig PMIC");
MODULE_ALIAS("platform:ab8500-regulator");
diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h
index 592a3f3994c0..9a7cf97e5040 100644
--- a/include/linux/regulator/ab8500.h
+++ b/include/linux/regulator/ab8500.h
@@ -5,6 +5,7 @@
*
* Authors: Sundar Iyer for ST-Ericsson
* Bengt Jonsson for ST-Ericsson
+ * Daniel Willerud for ST-Ericsson
*/
#ifndef __LINUX_MFD_AB8500_REGULATOR_H
@@ -27,7 +28,28 @@ enum ab8500_regulator_id {
AB8500_NUM_REGULATORS,
};
-/* AB9450 regulators */
+/* AB8505 regulators */
+enum ab8505_regulator_id {
+ AB8505_LDO_AUX1,
+ AB8505_LDO_AUX2,
+ AB8505_LDO_AUX3,
+ AB8505_LDO_AUX4,
+ AB8505_LDO_AUX5,
+ AB8505_LDO_AUX6,
+ AB8505_LDO_INTCORE,
+ AB8505_LDO_ADC,
+ AB8505_LDO_USB,
+ AB8505_LDO_AUDIO,
+ AB8505_LDO_ANAMIC1,
+ AB8505_LDO_ANAMIC2,
+ AB8505_LDO_AUX8,
+ AB8505_LDO_ANA,
+ AB8505_SYSCLKREQ_2,
+ AB8505_SYSCLKREQ_4,
+ AB8505_NUM_REGULATORS,
+};
+
+/* AB9540 regulators */
enum ab9540_regulator_id {
AB9540_LDO_AUX1,
AB9540_LDO_AUX2,
@@ -46,7 +68,7 @@ enum ab9540_regulator_id {
AB9540_NUM_REGULATORS,
};
-/* AB8500 and AB9540 register initialization */
+/* AB8500, AB8505, and AB9540 register initialization */
struct ab8500_regulator_reg_init {
int id;
u8 mask;
@@ -92,6 +114,55 @@ enum ab8500_regulator_reg {
AB8500_NUM_REGULATOR_REGISTERS,
};
+/* AB8505 registers */
+enum ab8505_regulator_reg {
+ AB8505_REGUREQUESTCTRL1,
+ AB8505_REGUREQUESTCTRL2,
+ AB8505_REGUREQUESTCTRL3,
+ AB8505_REGUREQUESTCTRL4,
+ AB8505_REGUSYSCLKREQ1HPVALID1,
+ AB8505_REGUSYSCLKREQ1HPVALID2,
+ AB8505_REGUHWHPREQ1VALID1,
+ AB8505_REGUHWHPREQ1VALID2,
+ AB8505_REGUHWHPREQ2VALID1,
+ AB8505_REGUHWHPREQ2VALID2,
+ AB8505_REGUSWHPREQVALID1,
+ AB8505_REGUSWHPREQVALID2,
+ AB8505_REGUSYSCLKREQVALID1,
+ AB8505_REGUSYSCLKREQVALID2,
+ AB8505_REGUVAUX4REQVALID,
+ AB8505_REGUMISC1,
+ AB8505_VAUDIOSUPPLY,
+ AB8505_REGUCTRL1VAMIC,
+ AB8505_VSMPSAREGU,
+ AB8505_VSMPSBREGU,
+ AB8505_VSAFEREGU, /* NOTE! PRCMU register */
+ AB8505_VPLLVANAREGU,
+ AB8505_EXTSUPPLYREGU,
+ AB8505_VAUX12REGU,
+ AB8505_VRF1VAUX3REGU,
+ AB8505_VSMPSASEL1,
+ AB8505_VSMPSASEL2,
+ AB8505_VSMPSASEL3,
+ AB8505_VSMPSBSEL1,
+ AB8505_VSMPSBSEL2,
+ AB8505_VSMPSBSEL3,
+ AB8505_VSAFESEL1, /* NOTE! PRCMU register */
+ AB8505_VSAFESEL2, /* NOTE! PRCMU register */
+ AB8505_VSAFESEL3, /* NOTE! PRCMU register */
+ AB8505_VAUX1SEL,
+ AB8505_VAUX2SEL,
+ AB8505_VRF1VAUX3SEL,
+ AB8505_VAUX4REQCTRL,
+ AB8505_VAUX4REGU,
+ AB8505_VAUX4SEL,
+ AB8505_REGUCTRLDISCH,
+ AB8505_REGUCTRLDISCH2,
+ AB8505_REGUCTRLDISCH3,
+ AB8505_CTRLVAUX5,
+ AB8505_CTRLVAUX6,
+ AB8505_NUM_REGULATOR_REGISTERS,
+};
/* AB9540 registers */
enum ab9540_regulator_reg {
-- cgit
From ae0a9a3efce22e77b5f0f2b266646431f492f7ed Mon Sep 17 00:00:00 2001
From: Lee Jones
Date: Thu, 28 Mar 2013 16:11:16 +0000
Subject: regulator: ab8500: Add support for the ab8540

To obtain full AB8540 regulator support, the AB8500 regulator driver first needs to know the AB8540's register layout and the initialisation value for each register. That information is provided here via a couple of large data structures.
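A note on how the variable-voltage entries below are consumed: the .voltage_bank/.voltage_reg/.voltage_mask/.voltage_shift fields locate a selector that indexes a microvolt table such as ldo_sdio_voltages. What follows is a stand-alone sketch of that decoding under those assumptions; the helper name is illustrative, and the real driver does this through its regulator ops.

#include <stdint.h>

/* Mask out the selector bits, shift them down, and index the chip's
 * microvolt table; -1 flags a selector beyond the table, whose length
 * the driver carries in .voltages_len. */
static int decode_voltage_sel(uint8_t reg_val, uint8_t mask,
			      unsigned int shift, const int *uv_table,
			      unsigned int table_len)
{
	unsigned int sel = (reg_val & mask) >> shift;

	if (sel >= table_len)
		return -1;
	return uv_table[sel];
}

For example, the LDO-INTCORE entry (.voltage_mask = 0x38, .voltage_shift = 3) maps a raw register value of 0x10 to selector 2, i.e. the third entry of ldo_vintcore_voltages (1250000 uV, given that table's 25 mV steps).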
Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- drivers/regulator/ab8500.c | 686 ++++++++++++++++++++++++++++++++++++++- include/linux/regulator/ab8500.h | 86 +++++ 2 files changed, 764 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 1ab0f8a7c862..ec609ab747ae 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -107,6 +107,18 @@ static const int ldo_vaux56_voltages[] = { 2790000, }; +static const int ldo_vaux3_ab8540_voltages[] = { + 1200000, + 1500000, + 1800000, + 2100000, + 2500000, + 2750000, + 2790000, + 2910000, + 3050000, +}; + static const unsigned int ldo_vintcore_voltages[] = { 1200000, 1225000, @@ -117,6 +129,17 @@ static const unsigned int ldo_vintcore_voltages[] = { 1350000, }; +static const int ldo_sdio_voltages[] = { + 1160000, + 1050000, + 1100000, + 1500000, + 1800000, + 2200000, + 2910000, + 3050000, +}; + static int ab8500_regulator_enable(struct regulator_dev *rdev) { int ret; @@ -726,10 +749,10 @@ static struct ab8500_regulator_info /* values for CtrlVaux5 register */ .update_bank = 0x01, .update_reg = 0x55, - .update_mask = 0x08, - .update_val = 0x00, - .update_val_idle = 0x01, - .update_val_normal = 0x00, + .update_mask = 0x18, + .update_val = 0x10, + .update_val_idle = 0x18, + .update_val_normal = 0x10, .voltage_bank = 0x01, .voltage_reg = 0x55, .voltage_mask = 0x07, @@ -751,10 +774,10 @@ static struct ab8500_regulator_info /* values for CtrlVaux6 register */ .update_bank = 0x01, .update_reg = 0x56, - .update_mask = 0x08, - .update_val = 0x00, - .update_val_idle = 0x01, - .update_val_normal = 0x00, + .update_mask = 0x18, + .update_val = 0x10, + .update_val_idle = 0x18, + .update_val_normal = 0x10, .voltage_bank = 0x01, .voltage_reg = 0x56, .voltage_mask = 0x07, @@ -1169,6 +1192,255 @@ static struct ab8500_regulator_info }, }; +/* AB8540 regulator information */ +static struct ab8500_regulator_info + ab8540_regulator_info[AB8540_NUM_REGULATORS] = { + /* + * Variable Voltage Regulators + * name, min mV, max mV, + * update bank, reg, mask, enable val + * volt bank, reg, mask, table, table length + */ + [AB8540_LDO_AUX1] = { + .desc = { + .name = "LDO-AUX1", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUX1, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), + }, + .load_lp_uA = 5000, + .update_bank = 0x04, + .update_reg = 0x09, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + .voltage_bank = 0x04, + .voltage_reg = 0x1f, + .voltage_mask = 0x0f, + .voltages = ldo_vauxn_voltages, + .voltages_len = ARRAY_SIZE(ldo_vauxn_voltages), + }, + [AB8540_LDO_AUX2] = { + .desc = { + .name = "LDO-AUX2", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUX2, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), + }, + .load_lp_uA = 5000, + .update_bank = 0x04, + .update_reg = 0x09, + .update_mask = 0x0c, + .update_val = 0x04, + .update_val_idle = 0x0c, + .update_val_normal = 0x04, + .voltage_bank = 0x04, + .voltage_reg = 0x20, + .voltage_mask = 0x0f, + .voltages = ldo_vauxn_voltages, + .voltages_len = ARRAY_SIZE(ldo_vauxn_voltages), + }, + [AB8540_LDO_AUX3] = { + .desc = { + .name = "LDO-AUX3", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUX3, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vaux3_ab8540_voltages), + }, + .load_lp_uA = 5000, + 
.update_bank = 0x04, + .update_reg = 0x0a, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + .voltage_bank = 0x04, + .voltage_reg = 0x21, + .voltage_mask = 0x07, + .voltages = ldo_vaux3_ab8540_voltages, + .voltages_len = ARRAY_SIZE(ldo_vaux3_ab8540_voltages), + }, + [AB8540_LDO_AUX4] = { + .desc = { + .name = "LDO-AUX4", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB9540_LDO_AUX4, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vauxn_voltages), + }, + .load_lp_uA = 5000, + /* values for Vaux4Regu register */ + .update_bank = 0x04, + .update_reg = 0x2e, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + /* values for Vaux4SEL register */ + .voltage_bank = 0x04, + .voltage_reg = 0x2f, + .voltage_mask = 0x0f, + .voltages = ldo_vauxn_voltages, + .voltages_len = ARRAY_SIZE(ldo_vauxn_voltages), + }, + [AB8540_LDO_INTCORE] = { + .desc = { + .name = "LDO-INTCORE", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_INTCORE, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vintcore_voltages), + }, + .load_lp_uA = 5000, + .update_bank = 0x03, + .update_reg = 0x80, + .update_mask = 0x44, + .update_val = 0x44, + .update_val_idle = 0x44, + .update_val_normal = 0x04, + .voltage_bank = 0x03, + .voltage_reg = 0x80, + .voltage_mask = 0x38, + .voltages = ldo_vintcore_voltages, + .voltages_len = ARRAY_SIZE(ldo_vintcore_voltages), + .voltage_shift = 3, + }, + + /* + * Fixed Voltage Regulators + * name, fixed mV, + * update bank, reg, mask, enable val + */ + [AB8540_LDO_TVOUT] = { + .desc = { + .name = "LDO-TVOUT", + .ops = &ab8500_regulator_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_TVOUT, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .delay = 10000, + .load_lp_uA = 1000, + .update_bank = 0x03, + .update_reg = 0x80, + .update_mask = 0x82, + .update_val = 0x02, + .update_val_idle = 0x82, + .update_val_normal = 0x02, + }, + [AB8540_LDO_AUDIO] = { + .desc = { + .name = "LDO-AUDIO", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_AUDIO, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x02, + .update_val = 0x02, + }, + [AB8540_LDO_ANAMIC1] = { + .desc = { + .name = "LDO-ANAMIC1", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_ANAMIC1, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x08, + .update_val = 0x08, + }, + [AB8540_LDO_ANAMIC2] = { + .desc = { + .name = "LDO-ANAMIC2", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_ANAMIC2, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x10, + .update_val = 0x10, + }, + [AB8540_LDO_DMIC] = { + .desc = { + .name = "LDO-DMIC", + .ops = &ab8500_regulator_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_DMIC, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .update_bank = 0x03, + .update_reg = 0x83, + .update_mask = 0x04, + .update_val = 0x04, + }, + + /* + * Regulators with fixed voltage and normal/idle modes + */ + [AB8540_LDO_ANA] = { + .desc = { + .name = "LDO-ANA", + .ops = &ab8500_regulator_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8500_LDO_ANA, + .owner = THIS_MODULE, + .n_voltages = 1, + }, + .load_lp_uA = 1000, + .update_bank = 0x04, + .update_reg = 0x06, + .update_mask 
= 0x0c, + .update_val = 0x04, + .update_val_idle = 0x0c, + .update_val_normal = 0x04, + }, + [AB8540_LDO_SDIO] = { + .desc = { + .name = "LDO-SDIO", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8540_LDO_SDIO, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_sdio_voltages), + }, + .min_uV = 1050000, + .max_uV = 3050000, + .load_lp_uA = 5000, + .update_bank = 0x03, + .update_reg = 0x88, + .update_mask = 0x30, + .update_val = 0x10, + .update_val_idle = 0x30, + .update_val_normal = 0x10, + .voltage_bank = 0x03, + .voltage_reg = 0x88, + .voltage_mask = 0x07, + .voltages = ldo_sdio_voltages, + .voltages_len = ARRAY_SIZE(ldo_sdio_voltages), + }, +}; + struct ab8500_reg_init { u8 bank; u8 addr; @@ -1898,6 +2170,384 @@ static struct ab8500_reg_init ab9540_reg_init[] = { REG_INIT(AB9540_REGUCTRLDISCH3, 0x04, 0x48, 0x01), }; +/* AB8540 register init */ +static struct ab8500_reg_init ab8540_reg_init[] = { + /* + * 0x01, VSimSycClkReq1Valid + * 0x02, VSimSycClkReq2Valid + * 0x04, VSimSycClkReq3Valid + * 0x08, VSimSycClkReq4Valid + * 0x10, VSimSycClkReq5Valid + * 0x20, VSimSycClkReq6Valid + * 0x40, VSimSycClkReq7Valid + * 0x80, VSimSycClkReq8Valid + */ + REG_INIT(AB8540_VSIMSYSCLKCTRL, 0x02, 0x33, 0xff), + /* + * 0x03, VarmRequestCtrl + * 0x0c, VapeRequestCtrl + * 0x30, Vsmps1RequestCtrl + * 0xc0, Vsmps2RequestCtrl + */ + REG_INIT(AB8540_REGUREQUESTCTRL1, 0x03, 0x03, 0xff), + /* + * 0x03, Vsmps3RequestCtrl + * 0x0c, VpllRequestCtrl + * 0x30, VanaRequestCtrl + * 0xc0, VextSupply1RequestCtrl + */ + REG_INIT(AB8540_REGUREQUESTCTRL2, 0x03, 0x04, 0xff), + /* + * 0x03, VextSupply2RequestCtrl + * 0x0c, VextSupply3RequestCtrl + * 0x30, Vaux1RequestCtrl + * 0xc0, Vaux2RequestCtrl + */ + REG_INIT(AB8540_REGUREQUESTCTRL3, 0x03, 0x05, 0xff), + /* + * 0x03, Vaux3RequestCtrl + * 0x04, SwHPReq + */ + REG_INIT(AB8540_REGUREQUESTCTRL4, 0x03, 0x06, 0x07), + /* + * 0x01, Vsmps1SysClkReq1HPValid + * 0x02, Vsmps2SysClkReq1HPValid + * 0x04, Vsmps3SysClkReq1HPValid + * 0x08, VanaSysClkReq1HPValid + * 0x10, VpllSysClkReq1HPValid + * 0x20, Vaux1SysClkReq1HPValid + * 0x40, Vaux2SysClkReq1HPValid + * 0x80, Vaux3SysClkReq1HPValid + */ + REG_INIT(AB8540_REGUSYSCLKREQ1HPVALID1, 0x03, 0x07, 0xff), + /* + * 0x01, VapeSysClkReq1HPValid + * 0x02, VarmSysClkReq1HPValid + * 0x04, VbbSysClkReq1HPValid + * 0x10, VextSupply1SysClkReq1HPValid + * 0x20, VextSupply2SysClkReq1HPValid + * 0x40, VextSupply3SysClkReq1HPValid + */ + REG_INIT(AB8540_REGUSYSCLKREQ1HPVALID2, 0x03, 0x08, 0x77), + /* + * 0x01, Vsmps1HwHPReq1Valid + * 0x02, Vsmps2HwHPReq1Valid + * 0x04, Vsmps3HwHPReq1Valid + * 0x08, VanaHwHPReq1Valid + * 0x10, VpllHwHPReq1Valid + * 0x20, Vaux1HwHPReq1Valid + * 0x40, Vaux2HwHPReq1Valid + * 0x80, Vaux3HwHPReq1Valid + */ + REG_INIT(AB8540_REGUHWHPREQ1VALID1, 0x03, 0x09, 0xff), + /* + * 0x01, VextSupply1HwHPReq1Valid + * 0x02, VextSupply2HwHPReq1Valid + * 0x04, VextSupply3HwHPReq1Valid + */ + REG_INIT(AB8540_REGUHWHPREQ1VALID2, 0x03, 0x0a, 0x07), + /* + * 0x01, Vsmps1HwHPReq2Valid + * 0x02, Vsmps2HwHPReq2Valid + * 0x03, Vsmps3HwHPReq2Valid + * 0x08, VanaHwHPReq2Valid + * 0x10, VpllHwHPReq2Valid + * 0x20, Vaux1HwHPReq2Valid + * 0x40, Vaux2HwHPReq2Valid + * 0x80, Vaux3HwHPReq2Valid + */ + REG_INIT(AB8540_REGUHWHPREQ2VALID1, 0x03, 0x0b, 0xff), + /* + * 0x01, VextSupply1HwHPReq2Valid + * 0x02, VextSupply2HwHPReq2Valid + * 0x04, VextSupply3HwHPReq2Valid + */ + REG_INIT(AB8540_REGUHWHPREQ2VALID2, 0x03, 0x0c, 0x07), + /* + * 0x01, VapeSwHPReqValid + * 0x02, VarmSwHPReqValid + * 0x04, Vsmps1SwHPReqValid + * 
0x08, Vsmps2SwHPReqValid + * 0x10, Vsmps3SwHPReqValid + * 0x20, VanaSwHPReqValid + * 0x40, VpllSwHPReqValid + * 0x80, Vaux1SwHPReqValid + */ + REG_INIT(AB8540_REGUSWHPREQVALID1, 0x03, 0x0d, 0xff), + /* + * 0x01, Vaux2SwHPReqValid + * 0x02, Vaux3SwHPReqValid + * 0x04, VextSupply1SwHPReqValid + * 0x08, VextSupply2SwHPReqValid + * 0x10, VextSupply3SwHPReqValid + */ + REG_INIT(AB8540_REGUSWHPREQVALID2, 0x03, 0x0e, 0x1f), + /* + * 0x02, SysClkReq2Valid1 + * ... + * 0x80, SysClkReq8Valid1 + */ + REG_INIT(AB8540_REGUSYSCLKREQVALID1, 0x03, 0x0f, 0xff), + /* + * 0x02, SysClkReq2Valid2 + * ... + * 0x80, SysClkReq8Valid2 + */ + REG_INIT(AB8540_REGUSYSCLKREQVALID2, 0x03, 0x10, 0xff), + /* + * 0x01, Vaux4SwHPReqValid + * 0x02, Vaux4HwHPReq2Valid + * 0x04, Vaux4HwHPReq1Valid + * 0x08, Vaux4SysClkReq1HPValid + */ + REG_INIT(AB8540_REGUVAUX4REQVALID, 0x03, 0x11, 0x0f), + /* + * 0x01, Vaux5SwHPReqValid + * 0x02, Vaux5HwHPReq2Valid + * 0x04, Vaux5HwHPReq1Valid + * 0x08, Vaux5SysClkReq1HPValid + */ + REG_INIT(AB8540_REGUVAUX5REQVALID, 0x03, 0x12, 0x0f), + /* + * 0x01, Vaux6SwHPReqValid + * 0x02, Vaux6HwHPReq2Valid + * 0x04, Vaux6HwHPReq1Valid + * 0x08, Vaux6SysClkReq1HPValid + */ + REG_INIT(AB8540_REGUVAUX6REQVALID, 0x03, 0x13, 0x0f), + /* + * 0x01, VclkbSwHPReqValid + * 0x02, VclkbHwHPReq2Valid + * 0x04, VclkbHwHPReq1Valid + * 0x08, VclkbSysClkReq1HPValid + */ + REG_INIT(AB8540_REGUVCLKBREQVALID, 0x03, 0x14, 0x0f), + /* + * 0x01, Vrf1SwHPReqValid + * 0x02, Vrf1HwHPReq2Valid + * 0x04, Vrf1HwHPReq1Valid + * 0x08, Vrf1SysClkReq1HPValid + */ + REG_INIT(AB8540_REGUVRF1REQVALID, 0x03, 0x15, 0x0f), + /* + * 0x02, VTVoutEna + * 0x04, Vintcore12Ena + * 0x38, Vintcore12Sel + * 0x40, Vintcore12LP + * 0x80, VTVoutLP + */ + REG_INIT(AB8540_REGUMISC1, 0x03, 0x80, 0xfe), + /* + * 0x02, VaudioEna + * 0x04, VdmicEna + * 0x08, Vamic1Ena + * 0x10, Vamic2Ena + * 0x20, Vamic12LP + * 0xC0, VdmicSel + */ + REG_INIT(AB8540_VAUDIOSUPPLY, 0x03, 0x83, 0xfe), + /* + * 0x01, Vamic1_dzout + * 0x02, Vamic2_dzout + */ + REG_INIT(AB8540_REGUCTRL1VAMIC, 0x03, 0x84, 0x03), + /* + * 0x07, VHSICSel + * 0x08, VHSICOffState + * 0x10, VHSIEna + * 0x20, VHSICLP + */ + REG_INIT(AB8540_VHSIC, 0x03, 0x87, 0x3f), + /* + * 0x07, VSDIOSel + * 0x08, VSDIOOffState + * 0x10, VSDIOEna + * 0x20, VSDIOLP + */ + REG_INIT(AB8540_VSDIO, 0x03, 0x88, 0x3f), + /* + * 0x03, Vsmps1Regu + * 0x0c, Vsmps1SelCtrl + * 0x10, Vsmps1AutoMode + * 0x20, Vsmps1PWMMode + */ + REG_INIT(AB8540_VSMPS1REGU, 0x04, 0x03, 0x3f), + /* + * 0x03, Vsmps2Regu + * 0x0c, Vsmps2SelCtrl + * 0x10, Vsmps2AutoMode + * 0x20, Vsmps2PWMMode + */ + REG_INIT(AB8540_VSMPS2REGU, 0x04, 0x04, 0x3f), + /* + * 0x03, Vsmps3Regu + * 0x0c, Vsmps3SelCtrl + * 0x10, Vsmps3AutoMode + * 0x20, Vsmps3PWMMode + * NOTE! 
PRCMU register + */ + REG_INIT(AB8540_VSMPS3REGU, 0x04, 0x05, 0x0f), + /* + * 0x03, VpllRegu + * 0x0c, VanaRegu + */ + REG_INIT(AB8540_VPLLVANAREGU, 0x04, 0x06, 0x0f), + /* + * 0x03, VextSupply1Regu + * 0x0c, VextSupply2Regu + * 0x30, VextSupply3Regu + * 0x40, ExtSupply2Bypass + * 0x80, ExtSupply3Bypass + */ + REG_INIT(AB8540_EXTSUPPLYREGU, 0x04, 0x08, 0xff), + /* + * 0x03, Vaux1Regu + * 0x0c, Vaux2Regu + */ + REG_INIT(AB8540_VAUX12REGU, 0x04, 0x09, 0x0f), + /* + * 0x0c, VRF1Regu + * 0x03, Vaux3Regu + */ + REG_INIT(AB8540_VRF1VAUX3REGU, 0x04, 0x0a, 0x0f), + /* + * 0x3f, Vsmps1Sel1 + */ + REG_INIT(AB8540_VSMPS1SEL1, 0x04, 0x13, 0x3f), + /* + * 0x3f, Vsmps1Sel2 + */ + REG_INIT(AB8540_VSMPS1SEL2, 0x04, 0x14, 0x3f), + /* + * 0x3f, Vsmps1Sel3 + */ + REG_INIT(AB8540_VSMPS1SEL3, 0x04, 0x15, 0x3f), + /* + * 0x3f, Vsmps2Sel1 + */ + REG_INIT(AB8540_VSMPS2SEL1, 0x04, 0x17, 0x3f), + /* + * 0x3f, Vsmps2Sel2 + */ + REG_INIT(AB8540_VSMPS2SEL2, 0x04, 0x18, 0x3f), + /* + * 0x3f, Vsmps2Sel3 + */ + REG_INIT(AB8540_VSMPS2SEL3, 0x04, 0x19, 0x3f), + /* + * 0x7f, Vsmps3Sel1 + * NOTE! PRCMU register + */ + REG_INIT(AB8540_VSMPS3SEL1, 0x04, 0x1b, 0x7f), + /* + * 0x7f, Vsmps3Sel2 + * NOTE! PRCMU register + */ + REG_INIT(AB8540_VSMPS3SEL2, 0x04, 0x1c, 0x7f), + /* + * 0x0f, Vaux1Sel + */ + REG_INIT(AB8540_VAUX1SEL, 0x04, 0x1f, 0x0f), + /* + * 0x0f, Vaux2Sel + */ + REG_INIT(AB8540_VAUX2SEL, 0x04, 0x20, 0x0f), + /* + * 0x07, Vaux3Sel + * 0x70, Vrf1Sel + */ + REG_INIT(AB8540_VRF1VAUX3SEL, 0x04, 0x21, 0x77), + /* + * 0x01, VextSupply12LP + */ + REG_INIT(AB8540_REGUCTRL2SPARE, 0x04, 0x22, 0x01), + /* + * 0x07, Vanasel + * 0x30, Vpllsel + */ + REG_INIT(AB8540_VANAVPLLSEL, 0x04, 0x29, 0x37), + /* + * 0x03, Vaux4RequestCtrl + */ + REG_INIT(AB8540_VAUX4REQCTRL, 0x04, 0x2d, 0x03), + /* + * 0x03, Vaux4Regu + */ + REG_INIT(AB8540_VAUX4REGU, 0x04, 0x2e, 0x03), + /* + * 0x0f, Vaux4Sel + */ + REG_INIT(AB8540_VAUX4SEL, 0x04, 0x2f, 0x0f), + /* + * 0x03, Vaux5RequestCtrl + */ + REG_INIT(AB8540_VAUX5REQCTRL, 0x04, 0x31, 0x03), + /* + * 0x03, Vaux5Regu + */ + REG_INIT(AB8540_VAUX5REGU, 0x04, 0x32, 0x03), + /* + * 0x3f, Vaux5Sel + */ + REG_INIT(AB8540_VAUX5SEL, 0x04, 0x33, 0x3f), + /* + * 0x03, Vaux6RequestCtrl + */ + REG_INIT(AB8540_VAUX6REQCTRL, 0x04, 0x34, 0x03), + /* + * 0x03, Vaux6Regu + */ + REG_INIT(AB8540_VAUX6REGU, 0x04, 0x35, 0x03), + /* + * 0x3f, Vaux6Sel + */ + REG_INIT(AB8540_VAUX6SEL, 0x04, 0x36, 0x3f), + /* + * 0x03, VCLKBRequestCtrl + */ + REG_INIT(AB8540_VCLKBREQCTRL, 0x04, 0x37, 0x03), + /* + * 0x03, VCLKBRegu + */ + REG_INIT(AB8540_VCLKBREGU, 0x04, 0x38, 0x03), + /* + * 0x07, VCLKBSel + */ + REG_INIT(AB8540_VCLKBSEL, 0x04, 0x39, 0x07), + /* + * 0x03, Vrf1RequestCtrl + */ + REG_INIT(AB8540_VRF1REQCTRL, 0x04, 0x3a, 0x03), + /* + * 0x01, VpllDisch + * 0x02, Vrf1Disch + * 0x04, Vaux1Disch + * 0x08, Vaux2Disch + * 0x10, Vaux3Disch + * 0x20, Vintcore12Disch + * 0x40, VTVoutDisch + * 0x80, VaudioDisch + */ + REG_INIT(AB8540_REGUCTRLDISCH, 0x04, 0x43, 0xff), + /* + * 0x02, VanaDisch + * 0x04, VdmicPullDownEna + * 0x08, VpllPullDownEna + * 0x10, VdmicDisch + */ + REG_INIT(AB8540_REGUCTRLDISCH2, 0x04, 0x44, 0x1e), + /* + * 0x01, Vaux4Disch + */ + REG_INIT(AB8540_REGUCTRLDISCH3, 0x04, 0x48, 0x01), + /* + * 0x01, Vaux5Disch + * 0x02, Vaux6Disch + * 0x04, VCLKBDisch + */ + REG_INIT(AB8540_REGUCTRLDISCH4, 0x04, 0x49, 0x07), +}; + static int ab8500_regulator_init_registers(struct platform_device *pdev, struct ab8500_reg_init *reg_init, int id, int mask, int value) @@ -2004,6 +2654,21 @@ static struct of_regulator_match 
ab8505_regulator_match[] = { { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8505_LDO_ANA, }, }; +static struct of_regulator_match ab8540_regulator_match[] = { + { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB8540_LDO_AUX1, }, + { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB8540_LDO_AUX2, }, + { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB8540_LDO_AUX3, }, + { .name = "ab8500_ldo_aux4", .driver_data = (void *) AB8540_LDO_AUX4, }, + { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB8540_LDO_INTCORE, }, + { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB8540_LDO_TVOUT, }, + { .name = "ab8500_ldo_audio", .driver_data = (void *) AB8540_LDO_AUDIO, }, + { .name = "ab8500_ldo_anamic1", .driver_data = (void *) AB8540_LDO_ANAMIC1, }, + { .name = "ab8500_ldo_amamic2", .driver_data = (void *) AB8540_LDO_ANAMIC2, }, + { .name = "ab8500_ldo_dmic", .driver_data = (void *) AB8540_LDO_DMIC, }, + { .name = "ab8500_ldo_ana", .driver_data = (void *) AB8540_LDO_ANA, }, + { .name = "ab8500_ldo_sdio", .driver_data = (void *) AB8540_LDO_SDIO, }, +}; + static struct of_regulator_match ab9540_regulator_match[] = { { .name = "ab8500_ldo_aux1", .driver_data = (void *) AB9540_LDO_AUX1, }, { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB9540_LDO_AUX2, }, @@ -2063,6 +2728,11 @@ static int ab8500_regulator_probe(struct platform_device *pdev) regulator_info_size = ARRAY_SIZE(ab8505_regulator_info); reg_init = ab8505_reg_init; reg_init_size = AB8505_NUM_REGULATOR_REGISTERS; + } else if (is_ab8540(ab8500)) { + regulator_info = ab8540_regulator_info; + regulator_info_size = ARRAY_SIZE(ab8540_regulator_info); + reg_init = ab8540_reg_init; + reg_init_size = AB8540_NUM_REGULATOR_REGISTERS; } else { regulator_info = ab8500_regulator_info; regulator_info_size = ARRAY_SIZE(ab8500_regulator_info); diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 9a7cf97e5040..bb0140c9d4f4 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -68,6 +68,25 @@ enum ab9540_regulator_id { AB9540_NUM_REGULATORS, }; +/* AB8540 regulators */ +enum ab8540_regulator_id { + AB8540_LDO_AUX1, + AB8540_LDO_AUX2, + AB8540_LDO_AUX3, + AB8540_LDO_AUX4, + AB8540_LDO_INTCORE, + AB8540_LDO_TVOUT, + AB8540_LDO_AUDIO, + AB8540_LDO_ANAMIC1, + AB8540_LDO_ANAMIC2, + AB8540_LDO_DMIC, + AB8540_LDO_ANA, + AB8540_LDO_SDIO, + AB8540_SYSCLKREQ_2, + AB8540_SYSCLKREQ_4, + AB8540_NUM_REGULATORS, +}; + /* AB8500, AB8505, and AB9540 register initialization */ struct ab8500_regulator_reg_init { int id; @@ -212,6 +231,73 @@ enum ab9540_regulator_reg { AB9540_NUM_REGULATOR_REGISTERS, }; +/* AB8540 registers */ +enum ab8540_regulator_reg { + AB8540_REGUREQUESTCTRL1, + AB8540_REGUREQUESTCTRL2, + AB8540_REGUREQUESTCTRL3, + AB8540_REGUREQUESTCTRL4, + AB8540_REGUSYSCLKREQ1HPVALID1, + AB8540_REGUSYSCLKREQ1HPVALID2, + AB8540_REGUHWHPREQ1VALID1, + AB8540_REGUHWHPREQ1VALID2, + AB8540_REGUHWHPREQ2VALID1, + AB8540_REGUHWHPREQ2VALID2, + AB8540_REGUSWHPREQVALID1, + AB8540_REGUSWHPREQVALID2, + AB8540_REGUSYSCLKREQVALID1, + AB8540_REGUSYSCLKREQVALID2, + AB8540_REGUVAUX4REQVALID, + AB8540_REGUVAUX5REQVALID, + AB8540_REGUVAUX6REQVALID, + AB8540_REGUVCLKBREQVALID, + AB8540_REGUVRF1REQVALID, + AB8540_REGUMISC1, + AB8540_VAUDIOSUPPLY, + AB8540_REGUCTRL1VAMIC, + AB8540_VHSIC, + AB8540_VSDIO, + AB8540_VSMPS1REGU, + AB8540_VSMPS2REGU, + AB8540_VSMPS3REGU, + AB8540_VPLLVANAREGU, + AB8540_EXTSUPPLYREGU, + AB8540_VAUX12REGU, + AB8540_VRF1VAUX3REGU, + AB8540_VSMPS1SEL1, + 
AB8540_VSMPS1SEL2, + AB8540_VSMPS1SEL3, + AB8540_VSMPS2SEL1, + AB8540_VSMPS2SEL2, + AB8540_VSMPS2SEL3, + AB8540_VSMPS3SEL1, + AB8540_VSMPS3SEL2, + AB8540_VAUX1SEL, + AB8540_VAUX2SEL, + AB8540_VRF1VAUX3SEL, + AB8540_REGUCTRL2SPARE, + AB8540_VAUX4REQCTRL, + AB8540_VAUX4REGU, + AB8540_VAUX4SEL, + AB8540_VAUX5REQCTRL, + AB8540_VAUX5REGU, + AB8540_VAUX5SEL, + AB8540_VAUX6REQCTRL, + AB8540_VAUX6REGU, + AB8540_VAUX6SEL, + AB8540_VCLKBREQCTRL, + AB8540_VCLKBREGU, + AB8540_VCLKBSEL, + AB8540_VRF1REQCTRL, + AB8540_REGUCTRLDISCH, + AB8540_REGUCTRLDISCH2, + AB8540_REGUCTRLDISCH3, + AB8540_REGUCTRLDISCH4, + AB8540_VSIMSYSCLKCTRL, + AB8540_VANAVPLLSEL, + AB8540_NUM_REGULATOR_REGISTERS, +}; + /* AB8500 external regulators */ struct ab8500_ext_regulator_cfg { bool hwreq; /* requires hw mode or high power mode */ -- cgit From 3cb7825bdc84d1d6c81ac9a2be201fe5bea5de05 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Fri, 29 Mar 2013 13:36:05 +0800 Subject: ARM: mxs: remove common.h All three remaining functions declared in common.h are implemented by clock driver. Create header include/linux/clk/mxs.h to contain them and remove common.h. Signed-off-by: Shawn Guo --- arch/arm/mach-mxs/include/mach/common.h | 20 -------------------- arch/arm/mach-mxs/mach-mxs.c | 2 +- include/linux/clk/mxs.h | 16 ++++++++++++++++ 3 files changed, 17 insertions(+), 21 deletions(-) delete mode 100644 arch/arm/mach-mxs/include/mach/common.h create mode 100644 include/linux/clk/mxs.h (limited to 'include/linux') diff --git a/arch/arm/mach-mxs/include/mach/common.h b/arch/arm/mach-mxs/include/mach/common.h deleted file mode 100644 index aca982c6d43f..000000000000 --- a/arch/arm/mach-mxs/include/mach/common.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * Copyright 2004-2007 Freescale Semiconductor, Inc. All Rights Reserved. - */ - -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MACH_MXS_COMMON_H__ -#define __MACH_MXS_COMMON_H__ - -extern int mxs_saif_clkmux_select(unsigned int clkmux); - -extern int mx23_clocks_init(void); - -extern int mx28_clocks_init(void); - -#endif /* __MACH_MXS_COMMON_H__ */ diff --git a/arch/arm/mach-mxs/mach-mxs.c b/arch/arm/mach-mxs/mach-mxs.c index f346c432cda2..fc762579d30f 100644 --- a/arch/arm/mach-mxs/mach-mxs.c +++ b/arch/arm/mach-mxs/mach-mxs.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include @@ -30,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/include/linux/clk/mxs.h b/include/linux/clk/mxs.h new file mode 100644 index 000000000000..90c30dc3efc7 --- /dev/null +++ b/include/linux/clk/mxs.h @@ -0,0 +1,16 @@ +/* + * Copyright (C) 2013 Freescale Semiconductor, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_CLK_MXS_H +#define __LINUX_CLK_MXS_H + +int mx23_clocks_init(void); +int mx28_clocks_init(void); +int mxs_saif_clkmux_select(unsigned int clkmux); + +#endif -- cgit From 543bb255a1987836e64f5b7a63664ead8b32b042 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Tue, 26 Mar 2013 20:37:57 -0600 Subject: spi: add ability to validate xfer->bits_per_word in SPI core Allow SPI masters to define the set of bits_per_word values they support. 
If they do this, then the SPI core will reject transfers that attempt to use an unsupported bits_per_word value. This eliminates the need for each SPI driver to implement this checking in most cases. Signed-off-by: Stephen Warren Signed-off-by: Mark Brown --- drivers/spi/spi.c | 8 ++++++++ include/linux/spi/spi.h | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'include/linux') diff --git a/drivers/spi/spi.c b/drivers/spi/spi.c index f996c600eb8c..0cabf1560550 100644 --- a/drivers/spi/spi.c +++ b/drivers/spi/spi.c @@ -1377,6 +1377,14 @@ static int __spi_async(struct spi_device *spi, struct spi_message *message) xfer->bits_per_word = spi->bits_per_word; if (!xfer->speed_hz) xfer->speed_hz = spi->max_speed_hz; + if (master->bits_per_word_mask) { + /* Only 32 bits fit in the mask */ + if (xfer->bits_per_word > 32) + return -EINVAL; + if (!(master->bits_per_word_mask & + BIT(xfer->bits_per_word - 1))) + return -EINVAL; + } } message->spi = spi; diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 38c2b925923d..733eb5ee31c5 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -228,6 +228,11 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * every chipselect is connected to a slave. * @dma_alignment: SPI controller constraint on DMA buffers alignment. * @mode_bits: flags understood by this controller driver + * @bits_per_word_mask: A mask indicating which values of bits_per_word are + * supported by the driver. Bit n indicates that a bits_per_word n+1 is + * suported. If set, the SPI core will reject any transfer with an + * unsupported bits_per_word. If not set, this value is simply ignored, + * and it's up to the individual driver to perform any validation. * @flags: other constraints relevant to this driver * @bus_lock_spinlock: spinlock for SPI bus locking * @bus_lock_mutex: mutex for SPI bus locking @@ -301,6 +306,9 @@ struct spi_master { /* spi_device.mode flags understood by this controller driver */ u16 mode_bits; + /* bitmask of supported bits_per_word for transfers */ + u32 bits_per_word_mask; + /* other constraints relevant to this driver */ u16 flags; #define SPI_MASTER_HALF_DUPLEX BIT(0) /* can't do full duplex */ -- cgit From 9fac2cf316b070ae43d2ae2525e381ff2d1d68aa Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:27 +0100 Subject: perf/x86: Add flags to event constraints This patch adds a flags field to each event constraint. It can be used to store event specific features which can then later be used by scheduling code or low-level x86 code. The flags are propagated into event->hw.flags during the get_event_constraint() call. They are cleared during the put_event_constraint() call. This mechanism is going to be used by the PEBS-LL patches. It avoids defining yet another table to hold event specific information. 
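As a sketch only (the event code, the counter bitmask and the PERF_X86_EVENT_DEMO flag below are made-up illustrations, not names defined by this patch), a constraint that carries a flag is declared through the extended macro:

static struct event_constraint demo_constraint =
        __EVENT_CONSTRAINT(0x01cd,                /* event code */
                           0xf,                   /* counter bitmask */
                           INTEL_ARCH_EVENT_MASK, /* cmask */
                           HWEIGHT(0xf),          /* weight */
                           0,                     /* overlap */
                           PERF_X86_EVENT_DEMO);  /* assumed flag */

When such a constraint is selected, x86_get_event_constraints() ORs c->flags into event->hw.flags; intel_put_event_constraints() clears the flags again when the event releases its constraint.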
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.h | 8 +++++--- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++++- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 +++- arch/x86/kernel/cpu/perf_event_intel_uncore.c | 2 +- include/linux/perf_event.h | 1 + 6 files changed, 16 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6e8ab0427041..8ba51518f689 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1489,7 +1489,7 @@ static int __init init_hw_perf_events(void) unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters, 0); + 0, x86_pmu.num_counters, 0, 0); x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b1518eed5f99..9686d38eb458 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -59,6 +59,7 @@ struct event_constraint { u64 cmask; int weight; int overlap; + int flags; }; struct amd_nb { @@ -170,16 +171,17 @@ struct cpu_hw_events { void *kfree_on_online; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ + .flags = f, \ } #define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) /* * The overlap flag marks event constraints with overlapping counter @@ -203,7 +205,7 @@ struct cpu_hw_events { * and its counter masks must be kept at a minimum. */ #define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) /* * Constraint on the Event code. 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index dab7580c47ae..df3beaac3397 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1392,8 +1392,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) + if ((event->hw.config & c->cmask) == c->code) { + /* hw.flags zeroed at initialization */ + event->hw.flags |= c->flags; return c; + } } } @@ -1438,6 +1441,7 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { + event->hw.flags = 0; intel_put_shared_regs_event_constraints(cpuc, event); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 826054a4f2ee..f30d85bcbda9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -430,8 +430,10 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) + if ((event->hw.config & c->cmask) == c->code) { + event->hw.flags |= c->flags; return c; + } } } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index b43200dbfe7e..75da9e18b128 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -2438,7 +2438,7 @@ static int __init uncore_type_init(struct intel_uncore_type *type) type->unconstrainted = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << type->num_counters) - 1, - 0, type->num_counters, 0); + 0, type->num_counters, 0, 0); for (i = 0; i < type->num_boxes; i++) { pmus[i].func_id = -1; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1c592114c437..cd3bb2cd9494 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -127,6 +127,7 @@ struct hw_perf_event { int event_base_rdpmc; int idx; int last_cpu; + int flags; struct hw_perf_event_extra extra_reg; struct hw_perf_event_extra branch_reg; -- cgit From c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 24 Jan 2013 16:10:28 +0100 Subject: perf/core: Add weighted samples For some events it's useful to weight sample with a hardware provided number. This expresses how expensive the action the sample represent was. This allows the profiler to scale the samples to be more informative to the programmer. There is already the period which is used similarly, but it means something different, so I chose to not overload it. Instead a new sample type for WEIGHT is added. Can be used for multiple things. Initially it is used for TSX abort costs and profiling by memory latencies (so to make expensive load appear higher up in the histograms). The concept is quite generic and can be extended to many other kinds of events or architectures, as long as the hardware provides suitable auxillary values. In principle it could be also used for software tracepoints. This adds the generic glue. A new optional sample format for a 64-bit weight value. 
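As an illustrative user space sketch (the raw event code 0x01cd is an assumed placeholder for a precise memory event, not something this patch defines), requesting weighted samples only needs the new bit in sample_type:

        struct perf_event_attr attr = {
                .size          = sizeof(attr),
                .type          = PERF_TYPE_RAW,
                .config        = 0x01cd,        /* assumed precise event */
                .sample_period = 10000,
                .sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_WEIGHT,
                .precise_ip    = 2,
        };

Each PERF_RECORD_SAMPLE then carries the trailing u64 weight field documented in the header update below.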
Signed-off-by: Andi Kleen Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-5-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 6 +++++- kernel/events/core.c | 6 ++++++ 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cd3bb2cd9494..7ce0b37b155b 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -573,6 +573,7 @@ struct perf_sample_data { struct perf_branch_stack *br_stack; struct perf_regs_user regs_user; u64 stack_user_size; + u64 weight; }; static inline void perf_sample_data_init(struct perf_sample_data *data, @@ -586,6 +587,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.abi = PERF_SAMPLE_REGS_ABI_NONE; data->regs_user.regs = NULL; data->stack_user_size = 0; + data->weight = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9fa9c622a7f4..cdc255da02e2 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -132,8 +132,10 @@ enum perf_event_sample_format { PERF_SAMPLE_BRANCH_STACK = 1U << 11, PERF_SAMPLE_REGS_USER = 1U << 12, PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + + PERF_SAMPLE_MAX = 1U << 15, /* non-ABI */ - PERF_SAMPLE_MAX = 1U << 14, /* non-ABI */ }; /* @@ -588,6 +590,8 @@ enum perf_event_type { * { u64 size; * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { u64 weight; } && PERF_SAMPLE_WEIGHT * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 7b4a55d41efc..9e3edb272b3e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -976,6 +976,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); + if (sample_type & PERF_SAMPLE_WEIGHT) + size += sizeof(data->weight); + if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -4193,6 +4196,9 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + + if (sample_type & PERF_SAMPLE_WEIGHT) + perf_output_put(handle, data->weight); } void perf_prepare_sample(struct perf_event_header *header, -- cgit From d6be9ad6c960f43800a6f118932bc8a5a4eadcd1 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:31 +0100 Subject: perf: Add generic memory sampling interface This patch adds PERF_SAMPLE_DATA_SRC. PERF_SAMPLE_DATA_SRC collects the data source, i.e., where did the data associated with the sampled instruction come from. Information is stored in a perf_mem_data_src structure. It contains opcode, mem level, tlb, snoop, lock information, subject to availability in hardware. 
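As a sketch (not part of this patch), a PMU driver filling in a sample could encode "load that hit modified data in a remote cache, one hop away" with the PERF_MEM_S() helper introduced below:

        union perf_mem_data_src dsrc;

        dsrc.val = PERF_MEM_S(OP, LOAD) |
                   PERF_MEM_S(LVL, REM_CCE1) | PERF_MEM_S(LVL, HIT) |
                   PERF_MEM_S(SNOOP, HITM) |
                   PERF_MEM_S(LOCK, NA) |
                   PERF_MEM_S(TLB, L1) | PERF_MEM_S(TLB, HIT);
        data->data_src = dsrc;  /* emitted when PERF_SAMPLE_DATA_SRC is set */

Consumers decode the same value through the bit fields of union perf_mem_data_src.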
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-8-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 68 +++++++++++++++++++++++++++++++++++++++-- kernel/events/core.c | 6 ++++ 3 files changed, 74 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7ce0b37b155b..42a6daaf4e0a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -568,6 +568,7 @@ struct perf_sample_data { u32 reserved; } cpu_entry; u64 period; + union perf_mem_data_src data_src; struct perf_callchain_entry *callchain; struct perf_raw_record *raw; struct perf_branch_stack *br_stack; @@ -588,6 +589,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->regs_user.regs = NULL; data->stack_user_size = 0; data->weight = 0; + data->data_src.val = 0; } extern void perf_output_sample(struct perf_output_handle *handle, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index cdc255da02e2..5b5762006855 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -133,9 +133,9 @@ enum perf_event_sample_format { PERF_SAMPLE_REGS_USER = 1U << 12, PERF_SAMPLE_STACK_USER = 1U << 13, PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, - PERF_SAMPLE_MAX = 1U << 15, /* non-ABI */ - + PERF_SAMPLE_MAX = 1U << 16, /* non-ABI */ }; /* @@ -592,6 +592,7 @@ enum perf_event_type { * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * }; */ PERF_RECORD_SAMPLE = 9, @@ -617,4 +618,67 @@ enum perf_callchain_context { #define PERF_FLAG_FD_OUTPUT (1U << 1) #define PERF_FLAG_PID_CGROUP (1U << 2) /* pid=cgroup id, per-cpu mode only */ +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_rsvd:31; + }; +}; + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* memory hierarchy (memory level, hit or miss) */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 hit */ +#define PERF_MEM_LVL_L3 0x40 /* L3 hit */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 
/* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +#define PERF_MEM_S(a, s) \ + (((u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 9e3edb272b3e..77c96d18c23a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -982,6 +982,9 @@ static void perf_event__header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_READ) size += event->read_size; + if (sample_type & PERF_SAMPLE_DATA_SRC) + size += sizeof(data->data_src.val); + event->header_size = size; } @@ -4199,6 +4202,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); + + if (sample_type & PERF_SAMPLE_DATA_SRC) + perf_output_put(handle, data->data_src.val); } void perf_prepare_sample(struct perf_event_header *header, -- cgit From ff45262a85dbf1bc74463c5dcea1d71a406d4d8e Mon Sep 17 00:00:00 2001 From: "Kim, Milo" Date: Mon, 18 Feb 2013 21:10:14 -0800 Subject: leds: add new LP5562 LED driver LP5562 can drive up to 4 channels, RGB and White. LEDs can be controlled directly via the led class control interface. LP55xx common driver LP5562 is one of LP55xx family device, so LP55xx common code are used. On the other hand, chip specific configuration is defined in the structure 'lp55xx_device_config' LED pattern data LP5562 has also internal program memory which is used for running various LED patterns. LP5562 driver supports the firmware interface and the predefined pattern data as well. LP5562 device attributes: 'led_pattern' and 'engine_mux' A 'led_pattern' is an index code which runs the predefined pattern data. And 'engine_mux' is updated with the firmware interface is activated. Detailed description has been updated in the documentation files, 'leds-lp55xx.txt' and 'leds-lp5562.txt'. Changes on the header file LP5562 configurable definitions are added. Pattern RGB data is fixed as constant value. (No side effect on other devices, LP5521 or LP5523.) (cooloney@gmail.com: remove redundant mutex_unlock(). 
Reported by Dan Carpenter ) Signed-off-by: Milo(Woogyom) Kim Signed-off-by: Bryan Wu --- Documentation/leds/00-INDEX | 2 + Documentation/leds/leds-lp5562.txt | 135 +++++++ Documentation/leds/leds-lp55xx.txt | 46 ++- drivers/leds/Kconfig | 14 +- drivers/leds/Makefile | 1 + drivers/leds/leds-lp5562.c | 593 ++++++++++++++++++++++++++++++ drivers/leds/leds-lp55xx-common.c | 2 +- include/linux/platform_data/leds-lp55xx.h | 13 +- 8 files changed, 799 insertions(+), 7 deletions(-) create mode 100644 Documentation/leds/leds-lp5562.txt create mode 100644 drivers/leds/leds-lp5562.c (limited to 'include/linux') diff --git a/Documentation/leds/00-INDEX b/Documentation/leds/00-INDEX index 5246090ef15c..1ecd1596633e 100644 --- a/Documentation/leds/00-INDEX +++ b/Documentation/leds/00-INDEX @@ -6,6 +6,8 @@ leds-lp5521.txt - notes on how to use the leds-lp5521 driver. leds-lp5523.txt - notes on how to use the leds-lp5523 driver. +leds-lp5562.txt + - notes on how to use the leds-lp5562 driver. leds-lp55xx.txt - description about lp55xx common driver. leds-lm3556.txt diff --git a/Documentation/leds/leds-lp5562.txt b/Documentation/leds/leds-lp5562.txt new file mode 100644 index 000000000000..96061000dd93 --- /dev/null +++ b/Documentation/leds/leds-lp5562.txt @@ -0,0 +1,135 @@ +Kernel driver for LP5562 +======================== + +* TI LP5562 LED Driver + +Author: Milo(Woogyom) Kim + +Description + + LP5562 can drive up to 4 channels. R/G/B and White. + LEDs can be controlled directly via the led class control interface. + + All four channels can be also controlled using the engine micro programs. + LP5562 has the internal program memory for running various LED patterns. + For the details, please refer to 'firmware' section in leds-lp55xx.txt + +Device attribute: engine_mux + + 3 Engines are allocated in LP5562, but the number of channel is 4. + Therefore each channel should be mapped to the engine number. + Value : RGB or W + + This attribute is used for programming LED data with the firmware interface. + Unlike the LP5521/LP5523/55231, LP5562 has unique feature for the engine mux, + so additional sysfs is required. + + LED Map + Red ... Engine 1 (fixed) + Green ... Engine 2 (fixed) + Blue ... Engine 3 (fixed) + White ... Engine 1 or 2 or 3 (selective) + +How to load the program data using engine_mux + + Before loading the LP5562 program data, engine_mux should be written between + the engine selection and loading the firmware. + Engine mux has two different mode, RGB and W. + RGB is used for loading RGB program data, W is used for W program data. + + For example, run blinking green channel pattern, + echo 2 > /sys/bus/i2c/devices/xxxx/select_engine # 2 is for green channel + echo "RGB" > /sys/bus/i2c/devices/xxxx/engine_mux # engine mux for RGB + echo 1 > /sys/class/firmware/lp5562/loading + echo "4000600040FF6000" > /sys/class/firmware/lp5562/data + echo 0 > /sys/class/firmware/lp5562/loading + echo 1 > /sys/bus/i2c/devices/xxxx/run_engine + + To run a blinking white pattern, + echo 1 or 2 or 3 > /sys/bus/i2c/devices/xxxx/select_engine + echo "W" > /sys/bus/i2c/devices/xxxx/engine_mux + echo 1 > /sys/class/firmware/lp5562/loading + echo "4000600040FF6000" > /sys/class/firmware/lp5562/data + echo 0 > /sys/class/firmware/lp5562/loading + echo 1 > /sys/bus/i2c/devices/xxxx/run_engine + +How to load the predefined patterns + + Please refer to 'leds-lp55xx.txt" + +Setting Current of Each Channel + + Like LP5521 and LP5523/55231, LP5562 provides LED current settings. 
+ The 'led_current' and 'max_current' are used. + +(Example of Platform data) + +To configure the platform specific data, lp55xx_platform_data structure is used. + +static struct lp55xx_led_config lp5562_led_config[] = { + { + .name = "R", + .chan_nr = 0, + .led_current = 20, + .max_current = 40, + }, + { + .name = "G", + .chan_nr = 1, + .led_current = 20, + .max_current = 40, + }, + { + .name = "B", + .chan_nr = 2, + .led_current = 20, + .max_current = 40, + }, + { + .name = "W", + .chan_nr = 3, + .led_current = 20, + .max_current = 40, + }, +}; + +static int lp5562_setup(void) +{ + /* setup HW resources */ +} + +static void lp5562_release(void) +{ + /* Release HW resources */ +} + +static void lp5562_enable(bool state) +{ + /* Control of chip enable signal */ +} + +static struct lp55xx_platform_data lp5562_platform_data = { + .led_config = lp5562_led_config, + .num_channels = ARRAY_SIZE(lp5562_led_config), + .setup_resources = lp5562_setup, + .release_resources = lp5562_release, + .enable = lp5562_enable, +}; + +If the current is set to 0 in the platform data, that channel is +disabled and it is not visible in the sysfs. + +The 'update_config' : CONFIG register (ADDR 08h) +This value is platform-specific data. +If update_config is not defined, the CONFIG register is set with +'LP5562_PWRSAVE_EN | LP5562_CLK_AUTO'. +(Enable auto-powersave, set automatic clock source selection) + +#define LP5562_CONFIGS (LP5562_PWM_HF | LP5562_PWRSAVE_EN | \ + LP5562_CLK_SRC_EXT) + +static struct lp55xx_platform_data lp5562_pdata = { + .led_config = lp5562_led_config, + .num_channels = ARRAY_SIZE(lp5562_led_config), + .update_config = LP5562_CONFIGS, +}; diff --git a/Documentation/leds/leds-lp55xx.txt b/Documentation/leds/leds-lp55xx.txt index ced41868d2d1..eec8fa2ffe4e 100644 --- a/Documentation/leds/leds-lp55xx.txt +++ b/Documentation/leds/leds-lp55xx.txt @@ -5,7 +5,7 @@ Authors: Milo(Woogyom) Kim Description ----------- -LP5521, LP5523/55231 have common features as below. +LP5521, LP5523/55231 and LP5562 have common features as below. Register access via the I2C Device initialization/deinitialization @@ -116,3 +116,47 @@ To support this, 'run_engine' and 'firmware_cb' are configurable in each driver. run_engine : Control the selected engine firmware_cb : The callback function after loading the firmware is done. Chip specific commands for loading and updating program memory. + +( Predefined pattern data ) + +Without the firmware interface, LP55xx driver provides another method for +loading a LED pattern. That is 'predefined' pattern. +A predefined pattern is defined in the platform data and load it(or them) +via the sysfs if needed. +To use the predefined pattern concept, 'patterns' and 'num_patterns' should be +configured. + + Example of predefined pattern data: + + /* mode_1: blinking data */ + static const u8 mode_1[] = { + 0x40, 0x00, 0x60, 0x00, 0x40, 0xFF, 0x60, 0x00, + }; + + /* mode_2: always on */ + static const u8 mode_2[] = { 0x40, 0xFF, }; + + struct lp55xx_predef_pattern board_led_patterns[] = { + { + .r = mode_1, + .size_r = ARRAY_SIZE(mode_1), + }, + { + .b = mode_2, + .size_b = ARRAY_SIZE(mode_2), + }, + } + + struct lp55xx_platform_data lp5562_pdata = { + ... + .patterns = board_led_patterns, + .num_patterns = ARRAY_SIZE(board_led_patterns), + }; + +Then, mode_1 and mode_2 can be run via through the sysfs. 
+ + echo 1 > /sys/bus/i2c/devices/xxxx/led_pattern # red blinking LED pattern + echo 2 > /sys/bus/i2c/devices/xxxx/led_pattern # blue LED always on + +To stop running pattern, + echo 0 > /sys/bus/i2c/devices/xxxx/led_pattern diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index ec50824c02ec..c7f755034375 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -194,8 +194,8 @@ config LEDS_LP3944 module will be called leds-lp3944. config LEDS_LP55XX_COMMON - tristate "Common Driver for TI/National LP5521 and LP5523/55231" - depends on LEDS_LP5521 || LEDS_LP5523 + tristate "Common Driver for TI/National LP5521, LP5523/55231 and LP5562" + depends on LEDS_LP5521 || LEDS_LP5523 || LEDS_LP5562 select FW_LOADER help This option supports common operations for LP5521 and LP5523/55231 @@ -222,6 +222,16 @@ config LEDS_LP5523 Driver provides direct control via LED class and interface for programming the engines. +config LEDS_LP5562 + tristate "LED Support for TI LP5562 LED driver chip" + depends on LEDS_CLASS && I2C + select LEDS_LP55XX_COMMON + help + If you say yes here you get support for TI LP5562 LED driver. + It is 4 channels chip with programmable engines. + Driver provides direct control via LED class and interface for + programming the engines. + config LEDS_LP8788 tristate "LED support for the TI LP8788 PMIC" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index 215e7e3b6173..ab8f5c549ad3 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_LEDS_LP3944) += leds-lp3944.o obj-$(CONFIG_LEDS_LP55XX_COMMON) += leds-lp55xx-common.o obj-$(CONFIG_LEDS_LP5521) += leds-lp5521.o obj-$(CONFIG_LEDS_LP5523) += leds-lp5523.o +obj-$(CONFIG_LEDS_LP5562) += leds-lp5562.o obj-$(CONFIG_LEDS_LP8788) += leds-lp8788.o obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o diff --git a/drivers/leds/leds-lp5562.c b/drivers/leds/leds-lp5562.c new file mode 100644 index 000000000000..f8b927788c3a --- /dev/null +++ b/drivers/leds/leds-lp5562.c @@ -0,0 +1,593 @@ +/* + * LP5562 LED driver + * + * Copyright (C) 2013 Texas Instruments + * + * Author: Milo(Woogyom) Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "leds-lp55xx-common.h" + +#define LP5562_PROGRAM_LENGTH 32 +#define LP5562_MAX_LEDS 4 + +/* ENABLE Register 00h */ +#define LP5562_REG_ENABLE 0x00 +#define LP5562_EXEC_ENG1_M 0x30 +#define LP5562_EXEC_ENG2_M 0x0C +#define LP5562_EXEC_ENG3_M 0x03 +#define LP5562_EXEC_M 0x3F +#define LP5562_MASTER_ENABLE 0x40 /* Chip master enable */ +#define LP5562_LOGARITHMIC_PWM 0x80 /* Logarithmic PWM adjustment */ +#define LP5562_EXEC_RUN 0x2A +#define LP5562_ENABLE_DEFAULT \ + (LP5562_MASTER_ENABLE | LP5562_LOGARITHMIC_PWM) +#define LP5562_ENABLE_RUN_PROGRAM \ + (LP5562_ENABLE_DEFAULT | LP5562_EXEC_RUN) + +/* OPMODE Register 01h */ +#define LP5562_REG_OP_MODE 0x01 +#define LP5562_MODE_ENG1_M 0x30 +#define LP5562_MODE_ENG2_M 0x0C +#define LP5562_MODE_ENG3_M 0x03 +#define LP5562_LOAD_ENG1 0x10 +#define LP5562_LOAD_ENG2 0x04 +#define LP5562_LOAD_ENG3 0x01 +#define LP5562_RUN_ENG1 0x20 +#define LP5562_RUN_ENG2 0x08 +#define LP5562_RUN_ENG3 0x02 +#define LP5562_ENG1_IS_LOADING(mode) \ + ((mode & LP5562_MODE_ENG1_M) == LP5562_LOAD_ENG1) +#define LP5562_ENG2_IS_LOADING(mode) \ + ((mode & LP5562_MODE_ENG2_M) == LP5562_LOAD_ENG2) +#define LP5562_ENG3_IS_LOADING(mode) \ + ((mode & LP5562_MODE_ENG3_M) == LP5562_LOAD_ENG3) + +/* BRIGHTNESS Registers */ +#define LP5562_REG_R_PWM 0x04 +#define LP5562_REG_G_PWM 0x03 +#define LP5562_REG_B_PWM 0x02 +#define LP5562_REG_W_PWM 0x0E + +/* CURRENT Registers */ +#define LP5562_REG_R_CURRENT 0x07 +#define LP5562_REG_G_CURRENT 0x06 +#define LP5562_REG_B_CURRENT 0x05 +#define LP5562_REG_W_CURRENT 0x0F + +/* CONFIG Register 08h */ +#define LP5562_REG_CONFIG 0x08 +#define LP5562_DEFAULT_CFG \ + (LP5562_PWM_HF | LP5562_PWRSAVE_EN | LP5562_CLK_INT) + +/* RESET Register 0Dh */ +#define LP5562_REG_RESET 0x0D +#define LP5562_RESET 0xFF + +/* PROGRAM ENGINE Registers */ +#define LP5562_REG_PROG_MEM_ENG1 0x10 +#define LP5562_REG_PROG_MEM_ENG2 0x30 +#define LP5562_REG_PROG_MEM_ENG3 0x50 + +/* LEDMAP Register 70h */ +#define LP5562_REG_ENG_SEL 0x70 +#define LP5562_ENG_SEL_PWM 0 +#define LP5562_ENG_FOR_RGB_M 0x3F +#define LP5562_ENG_SEL_RGB 0x1B /* R:ENG1, G:ENG2, B:ENG3 */ +#define LP5562_ENG_FOR_W_M 0xC0 +#define LP5562_ENG1_FOR_W 0x40 /* W:ENG1 */ +#define LP5562_ENG2_FOR_W 0x80 /* W:ENG2 */ +#define LP5562_ENG3_FOR_W 0xC0 /* W:ENG3 */ + +/* Program Commands */ +#define LP5562_CMD_DISABLE 0x00 +#define LP5562_CMD_LOAD 0x15 +#define LP5562_CMD_RUN 0x2A +#define LP5562_CMD_DIRECT 0x3F +#define LP5562_PATTERN_OFF 0 + +static inline void lp5562_wait_opmode_done(void) +{ + /* operation mode change needs to be longer than 153 us */ + usleep_range(200, 300); +} + +static inline void lp5562_wait_enable_done(void) +{ + /* it takes more 488 us to update ENABLE register */ + usleep_range(500, 600); +} + +static void lp5562_set_led_current(struct lp55xx_led *led, u8 led_current) +{ + u8 addr[] = { + LP5562_REG_R_CURRENT, + LP5562_REG_G_CURRENT, + LP5562_REG_B_CURRENT, + LP5562_REG_W_CURRENT, + }; + + led->led_current = led_current; + lp55xx_write(led->chip, addr[led->chan_nr], led_current); +} + +static void lp5562_load_engine(struct lp55xx_chip *chip) +{ + enum lp55xx_engine_index idx = chip->engine_idx; + u8 mask[] = { + [LP55XX_ENGINE_1] = LP5562_MODE_ENG1_M, + [LP55XX_ENGINE_2] = LP5562_MODE_ENG2_M, + [LP55XX_ENGINE_3] = LP5562_MODE_ENG3_M, + }; + + u8 val[] = { + [LP55XX_ENGINE_1] = LP5562_LOAD_ENG1, + [LP55XX_ENGINE_2] = LP5562_LOAD_ENG2, + [LP55XX_ENGINE_3] = LP5562_LOAD_ENG3, + }; + 
+ lp55xx_update_bits(chip, LP5562_REG_OP_MODE, mask[idx], val[idx]); + + lp5562_wait_opmode_done(); +} + +static void lp5562_stop_engine(struct lp55xx_chip *chip) +{ + lp55xx_write(chip, LP5562_REG_OP_MODE, LP5562_CMD_DISABLE); + lp5562_wait_opmode_done(); +} + +static void lp5562_run_engine(struct lp55xx_chip *chip, bool start) +{ + int ret; + u8 mode; + u8 exec; + + /* stop engine */ + if (!start) { + lp55xx_write(chip, LP5562_REG_ENABLE, LP5562_ENABLE_DEFAULT); + lp5562_wait_enable_done(); + lp5562_stop_engine(chip); + lp55xx_write(chip, LP5562_REG_ENG_SEL, LP5562_ENG_SEL_PWM); + lp55xx_write(chip, LP5562_REG_OP_MODE, LP5562_CMD_DIRECT); + lp5562_wait_opmode_done(); + return; + } + + /* + * To run the engine, + * operation mode and enable register should updated at the same time + */ + + ret = lp55xx_read(chip, LP5562_REG_OP_MODE, &mode); + if (ret) + return; + + ret = lp55xx_read(chip, LP5562_REG_ENABLE, &exec); + if (ret) + return; + + /* change operation mode to RUN only when each engine is loading */ + if (LP5562_ENG1_IS_LOADING(mode)) { + mode = (mode & ~LP5562_MODE_ENG1_M) | LP5562_RUN_ENG1; + exec = (exec & ~LP5562_EXEC_ENG1_M) | LP5562_RUN_ENG1; + } + + if (LP5562_ENG2_IS_LOADING(mode)) { + mode = (mode & ~LP5562_MODE_ENG2_M) | LP5562_RUN_ENG2; + exec = (exec & ~LP5562_EXEC_ENG2_M) | LP5562_RUN_ENG2; + } + + if (LP5562_ENG3_IS_LOADING(mode)) { + mode = (mode & ~LP5562_MODE_ENG3_M) | LP5562_RUN_ENG3; + exec = (exec & ~LP5562_EXEC_ENG3_M) | LP5562_RUN_ENG3; + } + + lp55xx_write(chip, LP5562_REG_OP_MODE, mode); + lp5562_wait_opmode_done(); + + lp55xx_update_bits(chip, LP5562_REG_ENABLE, LP5562_EXEC_M, exec); + lp5562_wait_enable_done(); +} + +static int lp5562_update_firmware(struct lp55xx_chip *chip, + const u8 *data, size_t size) +{ + enum lp55xx_engine_index idx = chip->engine_idx; + u8 pattern[LP5562_PROGRAM_LENGTH] = {0}; + u8 addr[] = { + [LP55XX_ENGINE_1] = LP5562_REG_PROG_MEM_ENG1, + [LP55XX_ENGINE_2] = LP5562_REG_PROG_MEM_ENG2, + [LP55XX_ENGINE_3] = LP5562_REG_PROG_MEM_ENG3, + }; + unsigned cmd; + char c[3]; + int program_size; + int nrchars; + int offset = 0; + int ret; + int i; + + /* clear program memory before updating */ + for (i = 0; i < LP5562_PROGRAM_LENGTH; i++) + lp55xx_write(chip, addr[idx] + i, 0); + + i = 0; + while ((offset < size - 1) && (i < LP5562_PROGRAM_LENGTH)) { + /* separate sscanfs because length is working only for %s */ + ret = sscanf(data + offset, "%2s%n ", c, &nrchars); + if (ret != 1) + goto err; + + ret = sscanf(c, "%2x", &cmd); + if (ret != 1) + goto err; + + pattern[i] = (u8)cmd; + offset += nrchars; + i++; + } + + /* Each instruction is 16bit long. Check that length is even */ + if (i % 2) + goto err; + + program_size = i; + for (i = 0; i < program_size; i++) + lp55xx_write(chip, addr[idx] + i, pattern[i]); + + return 0; + +err: + dev_err(&chip->cl->dev, "wrong pattern format\n"); + return -EINVAL; +} + +static void lp5562_firmware_loaded(struct lp55xx_chip *chip) +{ + const struct firmware *fw = chip->fw; + + if (fw->size > LP5562_PROGRAM_LENGTH) { + dev_err(&chip->cl->dev, "firmware data size overflow: %zu\n", + fw->size); + return; + } + + /* + * Program momery sequence + * 1) set engine mode to "LOAD" + * 2) write firmware data into program memory + */ + + lp5562_load_engine(chip); + lp5562_update_firmware(chip, fw->data, fw->size); +} + +static int lp5562_post_init_device(struct lp55xx_chip *chip) +{ + int ret; + u8 update_cfg = chip->pdata->update_config ? 
: LP5562_DEFAULT_CFG; + + /* Set all PWMs to direct control mode */ + ret = lp55xx_write(chip, LP5562_REG_OP_MODE, LP5562_CMD_DIRECT); + if (ret) + return ret; + + lp5562_wait_opmode_done(); + + ret = lp55xx_write(chip, LP5562_REG_CONFIG, update_cfg); + if (ret) + return ret; + + /* Initialize all channels PWM to zero -> leds off */ + lp55xx_write(chip, LP5562_REG_R_PWM, 0); + lp55xx_write(chip, LP5562_REG_G_PWM, 0); + lp55xx_write(chip, LP5562_REG_B_PWM, 0); + lp55xx_write(chip, LP5562_REG_W_PWM, 0); + + /* Set LED map as register PWM by default */ + lp55xx_write(chip, LP5562_REG_ENG_SEL, LP5562_ENG_SEL_PWM); + + return 0; +} + +static void lp5562_led_brightness_work(struct work_struct *work) +{ + struct lp55xx_led *led = container_of(work, struct lp55xx_led, + brightness_work); + struct lp55xx_chip *chip = led->chip; + u8 addr[] = { + LP5562_REG_R_PWM, + LP5562_REG_G_PWM, + LP5562_REG_B_PWM, + LP5562_REG_W_PWM, + }; + + mutex_lock(&chip->lock); + lp55xx_write(chip, addr[led->chan_nr], led->brightness); + mutex_unlock(&chip->lock); +} + +static void lp5562_write_program_memory(struct lp55xx_chip *chip, + u8 base, const u8 *rgb, int size) +{ + int i; + + if (!rgb || size <= 0) + return; + + for (i = 0; i < size; i++) + lp55xx_write(chip, base + i, *(rgb + i)); + + lp55xx_write(chip, base + i, 0); + lp55xx_write(chip, base + i + 1, 0); +} + +/* check the size of program count */ +static inline bool _is_pc_overflow(struct lp55xx_predef_pattern *ptn) +{ + return (ptn->size_r >= LP5562_PROGRAM_LENGTH || + ptn->size_g >= LP5562_PROGRAM_LENGTH || + ptn->size_b >= LP5562_PROGRAM_LENGTH); +} + +static int lp5562_run_predef_led_pattern(struct lp55xx_chip *chip, int mode) +{ + struct lp55xx_predef_pattern *ptn; + int i; + + if (mode == LP5562_PATTERN_OFF) { + lp5562_run_engine(chip, false); + return 0; + } + + ptn = chip->pdata->patterns + (mode - 1); + if (!ptn || _is_pc_overflow(ptn)) { + dev_err(&chip->cl->dev, "invalid pattern data\n"); + return -EINVAL; + } + + lp5562_stop_engine(chip); + + /* Set LED map as RGB */ + lp55xx_write(chip, LP5562_REG_ENG_SEL, LP5562_ENG_SEL_RGB); + + /* Load engines */ + for (i = LP55XX_ENGINE_1; i <= LP55XX_ENGINE_3; i++) { + chip->engine_idx = i; + lp5562_load_engine(chip); + } + + /* Clear program registers */ + lp55xx_write(chip, LP5562_REG_PROG_MEM_ENG1, 0); + lp55xx_write(chip, LP5562_REG_PROG_MEM_ENG1 + 1, 0); + lp55xx_write(chip, LP5562_REG_PROG_MEM_ENG2, 0); + lp55xx_write(chip, LP5562_REG_PROG_MEM_ENG2 + 1, 0); + lp55xx_write(chip, LP5562_REG_PROG_MEM_ENG3, 0); + lp55xx_write(chip, LP5562_REG_PROG_MEM_ENG3 + 1, 0); + + /* Program engines */ + lp5562_write_program_memory(chip, LP5562_REG_PROG_MEM_ENG1, + ptn->r, ptn->size_r); + lp5562_write_program_memory(chip, LP5562_REG_PROG_MEM_ENG2, + ptn->g, ptn->size_g); + lp5562_write_program_memory(chip, LP5562_REG_PROG_MEM_ENG3, + ptn->b, ptn->size_b); + + /* Run engines */ + lp5562_run_engine(chip, true); + + return 0; +} + +static ssize_t lp5562_store_pattern(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct lp55xx_led *led = i2c_get_clientdata(to_i2c_client(dev)); + struct lp55xx_chip *chip = led->chip; + struct lp55xx_predef_pattern *ptn = chip->pdata->patterns; + int num_patterns = chip->pdata->num_patterns; + unsigned long mode; + int ret; + + ret = kstrtoul(buf, 0, &mode); + if (ret) + return ret; + + if (mode > num_patterns || !ptn) + return -EINVAL; + + mutex_lock(&chip->lock); + ret = lp5562_run_predef_led_pattern(chip, mode); + mutex_unlock(&chip->lock); + + 
if (ret) + return ret; + + return len; +} + +static ssize_t lp5562_store_engine_mux(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + struct lp55xx_led *led = i2c_get_clientdata(to_i2c_client(dev)); + struct lp55xx_chip *chip = led->chip; + u8 mask; + u8 val; + + /* LED map + * R ... Engine 1 (fixed) + * G ... Engine 2 (fixed) + * B ... Engine 3 (fixed) + * W ... Engine 1 or 2 or 3 + */ + + if (sysfs_streq(buf, "RGB")) { + mask = LP5562_ENG_FOR_RGB_M; + val = LP5562_ENG_SEL_RGB; + } else if (sysfs_streq(buf, "W")) { + enum lp55xx_engine_index idx = chip->engine_idx; + + mask = LP5562_ENG_FOR_W_M; + switch (idx) { + case LP55XX_ENGINE_1: + val = LP5562_ENG1_FOR_W; + break; + case LP55XX_ENGINE_2: + val = LP5562_ENG2_FOR_W; + break; + case LP55XX_ENGINE_3: + val = LP5562_ENG3_FOR_W; + break; + default: + return -EINVAL; + } + + } else { + dev_err(dev, "choose RGB or W\n"); + return -EINVAL; + } + + mutex_lock(&chip->lock); + lp55xx_update_bits(chip, LP5562_REG_ENG_SEL, mask, val); + mutex_unlock(&chip->lock); + + return len; +} + +static DEVICE_ATTR(led_pattern, S_IWUSR, NULL, lp5562_store_pattern); +static DEVICE_ATTR(engine_mux, S_IWUSR, NULL, lp5562_store_engine_mux); + +static struct attribute *lp5562_attributes[] = { + &dev_attr_led_pattern.attr, + &dev_attr_engine_mux.attr, + NULL, +}; + +static const struct attribute_group lp5562_group = { + .attrs = lp5562_attributes, +}; + +/* Chip specific configurations */ +static struct lp55xx_device_config lp5562_cfg = { + .max_channel = LP5562_MAX_LEDS, + .reset = { + .addr = LP5562_REG_RESET, + .val = LP5562_RESET, + }, + .enable = { + .addr = LP5562_REG_ENABLE, + .val = LP5562_ENABLE_DEFAULT, + }, + .post_init_device = lp5562_post_init_device, + .set_led_current = lp5562_set_led_current, + .brightness_work_fn = lp5562_led_brightness_work, + .run_engine = lp5562_run_engine, + .firmware_cb = lp5562_firmware_loaded, + .dev_attr_group = &lp5562_group, +}; + +static int lp5562_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + int ret; + struct lp55xx_chip *chip; + struct lp55xx_led *led; + struct lp55xx_platform_data *pdata = client->dev.platform_data; + + if (!pdata) { + dev_err(&client->dev, "no platform data\n"); + return -EINVAL; + } + + chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); + if (!chip) + return -ENOMEM; + + led = devm_kzalloc(&client->dev, + sizeof(*led) * pdata->num_channels, GFP_KERNEL); + if (!led) + return -ENOMEM; + + chip->cl = client; + chip->pdata = pdata; + chip->cfg = &lp5562_cfg; + + mutex_init(&chip->lock); + + i2c_set_clientdata(client, led); + + ret = lp55xx_init_device(chip); + if (ret) + goto err_init; + + ret = lp55xx_register_leds(led, chip); + if (ret) + goto err_register_leds; + + ret = lp55xx_register_sysfs(chip); + if (ret) { + dev_err(&client->dev, "registering sysfs failed\n"); + goto err_register_sysfs; + } + + return 0; + +err_register_sysfs: + lp55xx_unregister_leds(led, chip); +err_register_leds: + lp55xx_deinit_device(chip); +err_init: + return ret; +} + +static int lp5562_remove(struct i2c_client *client) +{ + struct lp55xx_led *led = i2c_get_clientdata(client); + struct lp55xx_chip *chip = led->chip; + + lp5562_stop_engine(chip); + + lp55xx_unregister_sysfs(chip); + lp55xx_unregister_leds(led, chip); + lp55xx_deinit_device(chip); + + return 0; +} + +static const struct i2c_device_id lp5562_id[] = { + { "lp5562", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, lp5562_id); + +static struct i2c_driver lp5562_driver = { + .driver = { + 
.name = "lp5562", + }, + .probe = lp5562_probe, + .remove = lp5562_remove, + .id_table = lp5562_id, +}; + +module_i2c_driver(lp5562_driver); + +MODULE_DESCRIPTION("Texas Instruments LP5562 LED Driver"); +MODULE_AUTHOR("Milo Kim"); +MODULE_LICENSE("GPL"); diff --git a/drivers/leds/leds-lp55xx-common.c b/drivers/leds/leds-lp55xx-common.c index d9eb84157423..8a388a4afed7 100644 --- a/drivers/leds/leds-lp55xx-common.c +++ b/drivers/leds/leds-lp55xx-common.c @@ -1,5 +1,5 @@ /* - * LP5521/LP5523/LP55231 Common Driver + * LP5521/LP5523/LP55231/LP5562 Common Driver * * Copyright 2012 Texas Instruments * diff --git a/include/linux/platform_data/leds-lp55xx.h b/include/linux/platform_data/leds-lp55xx.h index 1509570d5a3f..1f1041e8b4fc 100644 --- a/include/linux/platform_data/leds-lp55xx.h +++ b/include/linux/platform_data/leds-lp55xx.h @@ -32,6 +32,13 @@ #define LP5521_CLK_INT 1 /* Internal clock */ #define LP5521_CLK_AUTO 2 /* Automatic clock selection */ +/* Bits in LP5562 CONFIG register */ +#define LP5562_PWM_HF LP5521_PWM_HF +#define LP5562_PWRSAVE_EN LP5521_PWRSAVE_EN +#define LP5562_CLK_SRC_EXT LP5521_CLK_SRC_EXT +#define LP5562_CLK_INT LP5521_CLK_INT +#define LP5562_CLK_AUTO LP5521_CLK_AUTO + struct lp55xx_led_config { const char *name; u8 chan_nr; @@ -40,9 +47,9 @@ struct lp55xx_led_config { }; struct lp55xx_predef_pattern { - u8 *r; - u8 *g; - u8 *b; + const u8 *r; + const u8 *g; + const u8 *b; u8 size_r; u8 size_g; u8 size_b; -- cgit From 39f7e08af3fd9ca1cb94a8270354afb2ea5cfcd3 Mon Sep 17 00:00:00 2001 From: "Kim, Milo" Date: Thu, 14 Mar 2013 04:29:19 -0700 Subject: leds: trigger: use inline functions instead of macros Macros are used in case that an inline function doesn't work. Otherwise, use an empty inline function. (a) Case of !CONFIG_LEDS_TRIGGERS Following macros are replaced with inline functions. led_trigger_register_simple() led_trigger_unregister_simple() led_trigger_event() To make inline types, the structure, 'led_trigger' should be defined. This structure has no member at all. (b) Case of !CONFIG_LEDS_TRIGGER_IDE_DISK ledtrig_ide_activity() macro is replaced with an inline function as well. (c) DEFINE_LED_TRIGGER() and DEFINE_LED_TRIGGER_GLOBAL() Struct 'led_trigger' is defined both cases, with CONFIG_LEDS_TRIGGERS and without CONFIG_LEDS_TRIGGERS. Those macros are moved out of CONFIG_LED_TRIGGERS because of no-dependency on CONFIG_LEDS_TRIGGERS. (d) Fix build errors in mmc-core driver After replacing macros with inline functions, following build errors occur. (condition: CONFIG_LEDS_TRIGGERS is not set) drivers/mmc/core/core.c: In function 'mmc_request_done': drivers/mmc/core/core.c:164:25: error: 'struct mmc_host' has no member named 'led' drivers/mmc/core/core.c: In function 'mmc_start_request': drivers/mmc/core/core.c:254:24: error: 'struct mmc_host' has no member named 'led' make[3]: *** [drivers/mmc/core/core.o] Error 1 The reason of these errors is non-existent member variable, 'led'. It is only valid when CONFIG_LEDS_TRIGGERS is set. But now, it can be used without this dependency. To fix build errors, member 'led' is always used without its config option in 'include/linux/mmc/host.h'. 
Signed-off-by: Milo(Woogyom) Kim Signed-off-by: Bryan Wu --- include/linux/leds.h | 25 ++++++++++++++----------- include/linux/mmc/host.h | 2 -- 2 files changed, 14 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 0d9b5eed714e..2d8c0b4f2f76 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -142,6 +142,10 @@ extern void led_set_brightness(struct led_classdev *led_cdev, /* * LED Triggers */ +/* Registration functions for simple triggers */ +#define DEFINE_LED_TRIGGER(x) static struct led_trigger *x; +#define DEFINE_LED_TRIGGER_GLOBAL(x) struct led_trigger *x; + #ifdef CONFIG_LEDS_TRIGGERS #define TRIG_NAME_MAX 50 @@ -164,9 +168,6 @@ struct led_trigger { extern int led_trigger_register(struct led_trigger *trigger); extern void led_trigger_unregister(struct led_trigger *trigger); -/* Registration functions for simple triggers */ -#define DEFINE_LED_TRIGGER(x) static struct led_trigger *x; -#define DEFINE_LED_TRIGGER_GLOBAL(x) struct led_trigger *x; extern void led_trigger_register_simple(const char *name, struct led_trigger **trigger); extern void led_trigger_unregister_simple(struct led_trigger *trigger); @@ -199,20 +200,22 @@ extern void led_trigger_rename_static(const char *name, #else -/* Triggers aren't active - null macros */ -#define DEFINE_LED_TRIGGER(x) -#define DEFINE_LED_TRIGGER_GLOBAL(x) -#define led_trigger_register_simple(x, y) do {} while(0) -#define led_trigger_unregister_simple(x) do {} while(0) -#define led_trigger_event(x, y) do {} while(0) +/* Trigger has no members */ +struct led_trigger {}; -#endif +/* Trigger inline empty functions */ +static inline void led_trigger_register_simple(const char *name, + struct led_trigger **trigger) {} +static inline void led_trigger_unregister_simple(struct led_trigger *trigger) {} +static inline void led_trigger_event(struct led_trigger *trigger, + enum led_brightness event) {} +#endif /* CONFIG_LEDS_TRIGGERS */ /* Trigger specific functions */ #ifdef CONFIG_LEDS_TRIGGER_IDE_DISK extern void ledtrig_ide_activity(void); #else -#define ledtrig_ide_activity() do {} while(0) +static inline void ledtrig_ide_activity(void) {} #endif /* diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index d6f20cc6415e..357e80efcde0 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -341,9 +341,7 @@ struct mmc_host { mmc_pm_flag_t pm_flags; /* requested pm features */ -#ifdef CONFIG_LEDS_TRIGGERS struct led_trigger *led; /* activity led */ -#endif #ifdef CONFIG_REGULATOR bool regulator_enabled; /* regulator state */ -- cgit From 48a1d032c954b9b06c3adbf35ef4735dd70ab757 Mon Sep 17 00:00:00 2001 From: "Kim, Milo" Date: Thu, 14 Mar 2013 04:29:24 -0700 Subject: leds: add camera LED triggers Some LED devices support flash/torch functionality through the LED subsystem. This patch enables direct LED trigger controls by the driver. Flash on/off and torch on/off can be done simply by other driver space. Two trigger APIs are added, ledtrig_flash_ctrl() and ledtrig_torch_ctrl(). 
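A usage sketch from a hypothetical camera driver (strobe_us is an assumed local variable, not part of this patch):

        /* strobe the flash LED around the exposure */
        ledtrig_flash_ctrl(true);       /* "flash" trigger -> LED_FULL */
        usleep_range(strobe_us, strobe_us + 100);
        ledtrig_flash_ctrl(false);      /* -> LED_OFF */

        /* or keep the torch LED lit during preview */
        ledtrig_torch_ctrl(true);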
Signed-off-by: Milo(Woogyom) Kim Signed-off-by: Bryan Wu --- drivers/leds/trigger/Kconfig | 8 +++++ drivers/leds/trigger/Makefile | 1 + drivers/leds/trigger/ledtrig-camera.c | 57 +++++++++++++++++++++++++++++++++++ include/linux/leds.h | 8 +++++ 4 files changed, 74 insertions(+) create mode 100644 drivers/leds/trigger/ledtrig-camera.c (limited to 'include/linux') diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index eaa286dc494e..49794b47b51c 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -100,4 +100,12 @@ config LEDS_TRIGGER_TRANSIENT GPIO/PWM based hardware. If unsure, say Y. +config LEDS_TRIGGER_CAMERA + tristate "LED Camera Flash/Torch Trigger" + depends on LEDS_TRIGGERS + help + This allows LEDs to be controlled as a camera flash/torch device. + This enables direct flash/torch on/off by the driver, kernel space. + If unsure, say Y. + endif # LEDS_TRIGGERS diff --git a/drivers/leds/trigger/Makefile b/drivers/leds/trigger/Makefile index 554e46ee4c24..1abf48dacf7e 100644 --- a/drivers/leds/trigger/Makefile +++ b/drivers/leds/trigger/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_LEDS_TRIGGER_GPIO) += ledtrig-gpio.o obj-$(CONFIG_LEDS_TRIGGER_CPU) += ledtrig-cpu.o obj-$(CONFIG_LEDS_TRIGGER_DEFAULT_ON) += ledtrig-default-on.o obj-$(CONFIG_LEDS_TRIGGER_TRANSIENT) += ledtrig-transient.o +obj-$(CONFIG_LEDS_TRIGGER_CAMERA) += ledtrig-camera.o diff --git a/drivers/leds/trigger/ledtrig-camera.c b/drivers/leds/trigger/ledtrig-camera.c new file mode 100644 index 000000000000..9bd73a8bad5c --- /dev/null +++ b/drivers/leds/trigger/ledtrig-camera.c @@ -0,0 +1,57 @@ +/* + * Camera Flash and Torch On/Off Trigger + * + * based on ledtrig-ide-disk.c + * + * Copyright 2013 Texas Instruments + * + * Author: Milo(Woogyom) Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include + +DEFINE_LED_TRIGGER(ledtrig_flash); +DEFINE_LED_TRIGGER(ledtrig_torch); + +void ledtrig_flash_ctrl(bool on) +{ + enum led_brightness brt = on ? LED_FULL : LED_OFF; + + led_trigger_event(ledtrig_flash, brt); +} +EXPORT_SYMBOL_GPL(ledtrig_flash_ctrl); + +void ledtrig_torch_ctrl(bool on) +{ + enum led_brightness brt = on ? 
LED_FULL : LED_OFF; + + led_trigger_event(ledtrig_torch, brt); +} +EXPORT_SYMBOL_GPL(ledtrig_torch_ctrl); + +static int __init ledtrig_camera_init(void) +{ + led_trigger_register_simple("flash", &ledtrig_flash); + led_trigger_register_simple("torch", &ledtrig_torch); + return 0; +} +module_init(ledtrig_camera_init); + +static void __exit ledtrig_camera_exit(void) +{ + led_trigger_unregister_simple(ledtrig_torch); + led_trigger_unregister_simple(ledtrig_flash); +} +module_exit(ledtrig_camera_exit); + +MODULE_DESCRIPTION("LED Trigger for Camera Flash/Torch Control"); +MODULE_AUTHOR("Milo Kim"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/leds.h b/include/linux/leds.h index 2d8c0b4f2f76..0287ab296689 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -218,6 +218,14 @@ extern void ledtrig_ide_activity(void); static inline void ledtrig_ide_activity(void) {} #endif +#if defined(CONFIG_LEDS_TRIGGER_CAMERA) || defined(CONFIG_LEDS_TRIGGER_CAMERA_MODULE) +extern void ledtrig_flash_ctrl(bool on); +extern void ledtrig_torch_ctrl(bool on); +#else +static inline void ledtrig_flash_ctrl(bool on) {} +static inline void ledtrig_torch_ctrl(bool on) {} +#endif + /* * Generic LED platform data for describing LED names and default triggers. */ -- cgit From 81f2a5b4a0570a662efd629c176fc1d67e56f7e3 Mon Sep 17 00:00:00 2001 From: "Kim, Milo" Date: Wed, 20 Mar 2013 17:37:04 -0700 Subject: leds: lp55xx: configure the clock detection Now LP55xx provides an automatic clock detection API, lp55xx_is_extclk_used(), so the clock configuration can be done by the driver itself. (a) Concept The default value is set by each driver along with the clock selection. The internal clock selection bit is set when the external clock is not detected or its rate is not 32kHz. (b) Change on LP55xx platform data The clock configuration is done automatically, so there is no need to define 'update_config' on the platform side. The related information is removed from the documentation and the header. (c) Definitions moved from header to driver files The CONFIG register values are moved into each driver, LP5521 and LP5562, and unnecessary definitions are removed as well. Signed-off-by: Milo(Woogyom) Kim Signed-off-by: Bryan Wu --- Documentation/leds/leds-lp5521.txt | 19 ------------------- Documentation/leds/leds-lp5562.txt | 15 --------------- drivers/leds/leds-lp5521.c | 19 +++++++++++++++++-- drivers/leds/leds-lp5562.c | 14 ++++++++++---- include/linux/platform_data/leds-lp55xx.h | 22 ---------------------- 5 files changed, 27 insertions(+), 62 deletions(-) (limited to 'include/linux') diff --git a/Documentation/leds/leds-lp5521.txt b/Documentation/leds/leds-lp5521.txt index 270f57196339..79e4c2e6e5e8 100644 --- a/Documentation/leds/leds-lp5521.txt +++ b/Documentation/leds/leds-lp5521.txt @@ -81,22 +81,3 @@ static struct lp55xx_platform_data lp5521_platform_data = { If the current is set to 0 in the platform data, that channel is disabled and it is not visible in the sysfs. - -The 'update_config' : CONFIG register (ADDR 08h) -This value is platform-specific data. -If update_config is not defined, the CONFIG register is set with -'LP5521_PWRSAVE_EN | LP5521_CP_MODE_AUTO | LP5521_R_TO_BATT'.
-(Enable auto-powersave, set charge pump to auto, red to battery) - -example of update_config : - -#define LP5521_CONFIGS (LP5521_PWM_HF | LP5521_PWRSAVE_EN | \ - LP5521_CP_MODE_AUTO | LP5521_R_TO_BATT | \ - LP5521_CLK_INT) - -static struct lp55xx_platform_data lp5521_pdata = { - .led_config = lp5521_led_config, - .num_channels = ARRAY_SIZE(lp5521_led_config), - .clock_mode = LP55XX_CLOCK_INT, - .update_config = LP5521_CONFIGS, -}; diff --git a/Documentation/leds/leds-lp5562.txt b/Documentation/leds/leds-lp5562.txt index 96061000dd93..5a823ff6b393 100644 --- a/Documentation/leds/leds-lp5562.txt +++ b/Documentation/leds/leds-lp5562.txt @@ -118,18 +118,3 @@ static struct lp55xx_platform_data lp5562_platform_data = { If the current is set to 0 in the platform data, that channel is disabled and it is not visible in the sysfs. - -The 'update_config' : CONFIG register (ADDR 08h) -This value is platform-specific data. -If update_config is not defined, the CONFIG register is set with -'LP5562_PWRSAVE_EN | LP5562_CLK_AUTO'. -(Enable auto-powersave, set automatic clock source selection) - -#define LP5562_CONFIGS (LP5562_PWM_HF | LP5562_PWRSAVE_EN | \ - LP5562_CLK_SRC_EXT) - -static struct lp55xx_platform_data lp5562_pdata = { - .led_config = lp5562_led_config, - .num_channels = ARRAY_SIZE(lp5562_led_config), - .update_config = LP5562_CONFIGS, -}; diff --git a/drivers/leds/leds-lp5521.c b/drivers/leds/leds-lp5521.c index 7f10304219ea..19752c928aa2 100644 --- a/drivers/leds/leds-lp5521.c +++ b/drivers/leds/leds-lp5521.c @@ -68,6 +68,18 @@ #define LP5521_ENABLE_RUN_PROGRAM \ (LP5521_ENABLE_DEFAULT | LP5521_EXEC_RUN) +/* CONFIG register */ +#define LP5521_PWM_HF 0x40 /* PWM: 0 = 256Hz, 1 = 558Hz */ +#define LP5521_PWRSAVE_EN 0x20 /* 1 = Power save mode */ +#define LP5521_CP_MODE_OFF 0 /* Charge pump (CP) off */ +#define LP5521_CP_MODE_BYPASS 8 /* CP forced to bypass mode */ +#define LP5521_CP_MODE_1X5 0x10 /* CP forced to 1.5x mode */ +#define LP5521_CP_MODE_AUTO 0x18 /* Automatic mode selection */ +#define LP5521_R_TO_BATT 0x04 /* R out: 0 = CP, 1 = Vbat */ +#define LP5521_CLK_INT 0x01 /* Internal clock */ +#define LP5521_DEFAULT_CFG \ + (LP5521_PWM_HF | LP5521_PWRSAVE_EN | LP5521_CP_MODE_AUTO) + /* Status */ #define LP5521_EXT_CLK_USED 0x08 @@ -296,8 +308,11 @@ static int lp5521_post_init_device(struct lp55xx_chip *chip) /* Set all PWMs to direct control mode */ ret = lp55xx_write(chip, LP5521_REG_OP_MODE, LP5521_CMD_DIRECT); - val = chip->pdata->update_config ? - : (LP5521_PWRSAVE_EN | LP5521_CP_MODE_AUTO | LP5521_R_TO_BATT); + /* Update configuration for the clock setting */ + val = LP5521_DEFAULT_CFG; + if (!lp55xx_is_extclk_used(chip)) + val |= LP5521_CLK_INT; + ret = lp55xx_write(chip, LP5521_REG_CONFIG, val); if (ret) return ret; diff --git a/drivers/leds/leds-lp5562.c b/drivers/leds/leds-lp5562.c index f8b927788c3a..513f2390ca2d 100644 --- a/drivers/leds/leds-lp5562.c +++ b/drivers/leds/leds-lp5562.c @@ -71,8 +71,10 @@ /* CONFIG Register 08h */ #define LP5562_REG_CONFIG 0x08 -#define LP5562_DEFAULT_CFG \ - (LP5562_PWM_HF | LP5562_PWRSAVE_EN | LP5562_CLK_INT) +#define LP5562_PWM_HF 0x40 +#define LP5562_PWRSAVE_EN 0x20 +#define LP5562_CLK_INT 0x01 /* Internal clock */ +#define LP5562_DEFAULT_CFG (LP5562_PWM_HF | LP5562_PWRSAVE_EN) /* RESET Register 0Dh */ #define LP5562_REG_RESET 0x0D @@ -280,7 +282,7 @@ static void lp5562_firmware_loaded(struct lp55xx_chip *chip) static int lp5562_post_init_device(struct lp55xx_chip *chip) { int ret; - u8 update_cfg = chip->pdata->update_config ? 
: LP5562_DEFAULT_CFG; + u8 cfg = LP5562_DEFAULT_CFG; /* Set all PWMs to direct control mode */ ret = lp55xx_write(chip, LP5562_REG_OP_MODE, LP5562_CMD_DIRECT); @@ -289,7 +291,11 @@ static int lp5562_post_init_device(struct lp55xx_chip *chip) lp5562_wait_opmode_done(); - ret = lp55xx_write(chip, LP5562_REG_CONFIG, update_cfg); + /* Update configuration for the clock setting */ + if (!lp55xx_is_extclk_used(chip)) + cfg |= LP5562_CLK_INT; + + ret = lp55xx_write(chip, LP5562_REG_CONFIG, cfg); if (ret) return ret; diff --git a/include/linux/platform_data/leds-lp55xx.h b/include/linux/platform_data/leds-lp55xx.h index 1f1041e8b4fc..202e290faea8 100644 --- a/include/linux/platform_data/leds-lp55xx.h +++ b/include/linux/platform_data/leds-lp55xx.h @@ -20,25 +20,6 @@ #define LP55XX_CLOCK_INT 1 #define LP55XX_CLOCK_EXT 2 -/* Bits in LP5521 CONFIG register. 'update_config' in lp55xx_platform_data */ -#define LP5521_PWM_HF 0x40 /* PWM: 0 = 256Hz, 1 = 558Hz */ -#define LP5521_PWRSAVE_EN 0x20 /* 1 = Power save mode */ -#define LP5521_CP_MODE_OFF 0 /* Charge pump (CP) off */ -#define LP5521_CP_MODE_BYPASS 8 /* CP forced to bypass mode */ -#define LP5521_CP_MODE_1X5 0x10 /* CP forced to 1.5x mode */ -#define LP5521_CP_MODE_AUTO 0x18 /* Automatic mode selection */ -#define LP5521_R_TO_BATT 4 /* R out: 0 = CP, 1 = Vbat */ -#define LP5521_CLK_SRC_EXT 0 /* Ext-clk source (CLK_32K) */ -#define LP5521_CLK_INT 1 /* Internal clock */ -#define LP5521_CLK_AUTO 2 /* Automatic clock selection */ - -/* Bits in LP5562 CONFIG register */ -#define LP5562_PWM_HF LP5521_PWM_HF -#define LP5562_PWRSAVE_EN LP5521_PWRSAVE_EN -#define LP5562_CLK_SRC_EXT LP5521_CLK_SRC_EXT -#define LP5562_CLK_INT LP5521_CLK_INT -#define LP5562_CLK_AUTO LP5521_CLK_AUTO - struct lp55xx_led_config { const char *name; u8 chan_nr; @@ -86,9 +67,6 @@ struct lp55xx_platform_data { /* Predefined pattern data */ struct lp55xx_predef_pattern *patterns; unsigned int num_patterns; - - /* _CONFIG register */ - u8 update_config; }; #endif /* _LEDS_LP55XX_H */ -- cgit From d55262c4d164759a8debe772da6c9b16059dec47 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 11:23:38 -0700 Subject: workqueue: update sysfs interface to reflect NUMA awareness and a kernel param to disable NUMA affinity Unbound workqueues are now NUMA aware. Let's add some control knobs and update the sysfs interface accordingly. * Add kernel param workqueue.disable_numa which disables NUMA affinity globally. * Replace sysfs file "pool_id" with "pool_ids" which contains node:pool_id pairs. This change is userland-visible but "pool_id" hasn't seen a release yet, so this is okay. * Add a new sysfs file "numa" which can toggle NUMA affinity on individual workqueues. This is implemented as attrs->no_numa which is special in that it isn't part of a pool's attributes. It only affects how apply_workqueue_attrs() picks which pools to use. After the "pool_ids" change, first_pwq() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Reviewed-by: Lai Jiangshan --- Documentation/kernel-parameters.txt | 9 ++++ include/linux/workqueue.h | 5 +++ kernel/workqueue.c | 82 ++++++++++++++++++++++++++----------- 3 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 4609e81dbc37..c75ea0b8ec59 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3222,6 +3222,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
or other driver-specific files in the Documentation/watchdog/ directory. + workqueue.disable_numa + By default, all work items queued to unbound + workqueues are affine to the NUMA nodes they're + issued on, which results in better behavior in + general. If NUMA affinity needs to be disabled for + whatever reason, this option can be used. Note + that this also can be controlled per-workqueue for + workqueues visible under /sys/bus/workqueue/. + x2apic_phys [X86-64,APIC] Use x2apic physical mode instead of default x2apic cluster mode on platforms supporting x2apic. diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 835d12b76960..717975639378 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -119,10 +119,15 @@ struct delayed_work { /* * A struct for workqueue attributes. This can be used to change * attributes of an unbound workqueue. + * + * Unlike other fields, ->no_numa isn't a property of a worker_pool. It + * only modifies how apply_workqueue_attrs() select pools and thus doesn't + * participate in pool hash calculations or equality comparisons. */ struct workqueue_attrs { int nice; /* nice level */ cpumask_var_t cpumask; /* allowed CPUs */ + bool no_numa; /* disable NUMA affinity */ }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 57cd77de4a4f..729ac6a44860 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -268,6 +268,9 @@ static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ static cpumask_var_t *wq_numa_possible_cpumask; /* possible CPUs of each node */ +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ /* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ @@ -516,21 +519,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * first_pwq - return the first pool_workqueue of the specified workqueue - * @wq: the target workqueue - * - * This must be called either with wq->mutex held or sched RCU read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. 
- */ -static struct pool_workqueue *first_pwq(struct workqueue_struct *wq) -{ - assert_rcu_or_wq_mutex(wq); - return list_first_or_null_rcu(&wq->pwqs, struct pool_workqueue, - pwqs_node); -} - /** * unbound_pwq_by_node - return the unbound pool_workqueue for the given node * @wq: the target workqueue @@ -3114,16 +3102,21 @@ static struct device_attribute wq_sysfs_attrs[] = { __ATTR_NULL, }; -static ssize_t wq_pool_id_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); - struct worker_pool *pool; - int written; + const char *delim = ""; + int node, written = 0; rcu_read_lock_sched(); - pool = first_pwq(wq)->pool; - written = scnprintf(buf, PAGE_SIZE, "%d\n", pool->id); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); rcu_read_unlock_sched(); return written; @@ -3212,10 +3205,46 @@ static ssize_t wq_cpumask_store(struct device *dev, return ret ?: count; } +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_id, 0444, wq_pool_id_show, NULL), + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -3750,7 +3779,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { - if (!wq_numa_enabled) + if (!wq_numa_enabled || attrs->no_numa) goto use_dfl; /* does @node have any online CPUs @attrs wants? 
*/ @@ -3951,6 +3980,8 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); pwq = unbound_pwq_by_node(wq, node); @@ -4763,6 +4794,11 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); BUG_ON(!wq_update_unbound_numa_attrs_buf); -- cgit From 253b5374f08f3908cc380c5665470a5b7609be1c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 11 Dec 2012 13:14:09 +0900 Subject: mfd: wm5102: Add registers for microphone detection level configuration Signed-off-by: Mark Brown --- drivers/mfd/wm5102-tables.c | 8 ++++++++ include/linux/mfd/arizona/registers.h | 4 ++++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/mfd/wm5102-tables.c b/drivers/mfd/wm5102-tables.c index a433f580aa4c..ca2aed6bc830 100644 --- a/drivers/mfd/wm5102-tables.c +++ b/drivers/mfd/wm5102-tables.c @@ -331,6 +331,10 @@ static const struct reg_default wm5102_reg_default[] = { { 0x000002A3, 0x1102 }, /* R675 - Mic Detect 1 */ { 0x000002A4, 0x009F }, /* R676 - Mic Detect 2 */ { 0x000002A5, 0x0000 }, /* R677 - Mic Detect 3 */ + { 0x000002A6, 0x3737 }, /* R678 - Mic Detect Level 1 */ + { 0x000002A7, 0x372C }, /* R679 - Mic Detect Level 2 */ + { 0x000002A8, 0x1422 }, /* R680 - Mic Detect Level 3 */ + { 0x000002A9, 0x030A }, /* R681 - Mic Detect Level 4 */ { 0x000002C3, 0x0000 }, /* R707 - Mic noise mix control 1 */ { 0x000002CB, 0x0000 }, /* R715 - Isolation control */ { 0x000002D3, 0x0000 }, /* R723 - Jack detect analogue */ @@ -1090,6 +1094,10 @@ static bool wm5102_readable_register(struct device *dev, unsigned int reg) case ARIZONA_MIC_DETECT_1: case ARIZONA_MIC_DETECT_2: case ARIZONA_MIC_DETECT_3: + case ARIZONA_MIC_DETECT_LEVEL_1: + case ARIZONA_MIC_DETECT_LEVEL_2: + case ARIZONA_MIC_DETECT_LEVEL_3: + case ARIZONA_MIC_DETECT_LEVEL_4: case ARIZONA_MIC_NOISE_MIX_CONTROL_1: case ARIZONA_ISOLATION_CONTROL: case ARIZONA_JACK_DETECT_ANALOGUE: diff --git a/include/linux/mfd/arizona/registers.h b/include/linux/mfd/arizona/registers.h index 340355136069..f43aa7c8d040 100644 --- a/include/linux/mfd/arizona/registers.h +++ b/include/linux/mfd/arizona/registers.h @@ -124,6 +124,10 @@ #define ARIZONA_MIC_DETECT_1 0x2A3 #define ARIZONA_MIC_DETECT_2 0x2A4 #define ARIZONA_MIC_DETECT_3 0x2A5 +#define ARIZONA_MIC_DETECT_LEVEL_1 0x2A6 +#define ARIZONA_MIC_DETECT_LEVEL_2 0x2A7 +#define ARIZONA_MIC_DETECT_LEVEL_3 0x2A8 +#define ARIZONA_MIC_DETECT_LEVEL_4 0x2A9 #define ARIZONA_MIC_NOISE_MIX_CONTROL_1 0x2C3 #define ARIZONA_ISOLATION_CONTROL 0x2CB #define ARIZONA_JACK_DETECT_ANALOGUE 0x2D3 -- cgit From 932bc4d7a53ba418de67fdab533248df5b36c752 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Thu, 21 Mar 2013 11:57:58 +0200 Subject: net: add skb_dst_set_noref_force Rename skb_dst_set_noref to __skb_dst_set_noref and add a force flag, as suggested by David Miller. The new wrapper skb_dst_set_noref_force will force dst entries that are not cached to be attached as skb dst without taking a reference, as long as the provided dst is reclaimed after an RCU grace period. Signed-off-by: Julian Anastasov Signed-off-by: Hans Schillstrom Acked-by: David S.
Miller Signed-off-by: Simon Horman --- include/linux/skbuff.h | 35 ++++++++++++++++++++++++++++++++++- net/core/dst.c | 9 +++++---- 2 files changed, 39 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 878e0ee81068..364e2440a7ee 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -575,7 +575,40 @@ static inline void skb_dst_set(struct sk_buff *skb, struct dst_entry *dst) skb->_skb_refdst = (unsigned long)dst; } -extern void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst); +extern void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, + bool force); + +/** + * skb_dst_set_noref - sets skb dst, hopefully, without taking reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst. + * If dst entry is cached, we do not take reference and dst_release + * will be avoided by refdst_drop. If dst entry is not cached, we take + * reference, so that last dst_release can destroy the dst immediately. + */ +static inline void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + __skb_dst_set_noref(skb, dst, false); +} + +/** + * skb_dst_set_noref_force - sets skb dst, without taking reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst. + * No reference is taken and no dst_release will be called. While for + * cached dsts deferred reclaim is a basic feature, for entries that are + * not cached it is caller's job to guarantee that last dst_release for + * provided dst happens when nobody uses it, eg. after a RCU grace period. + */ +static inline void skb_dst_set_noref_force(struct sk_buff *skb, + struct dst_entry *dst) +{ + __skb_dst_set_noref(skb, dst, true); +} /** * skb_dst_is_noref - Test if skb dst isn't refcounted diff --git a/net/core/dst.c b/net/core/dst.c index 35fd12f1a69c..df9cc810ec8e 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -320,27 +320,28 @@ void __dst_destroy_metrics_generic(struct dst_entry *dst, unsigned long old) EXPORT_SYMBOL(__dst_destroy_metrics_generic); /** - * skb_dst_set_noref - sets skb dst, without a reference + * __skb_dst_set_noref - sets skb dst, without a reference * @skb: buffer * @dst: dst entry + * @force: if force is set, use noref version even for DST_NOCACHE entries * * Sets skb dst, assuming a reference was not taken on dst * skb_dst_drop() should not dst_release() this dst */ -void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +void __skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst, bool force) { WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); /* If dst not in cache, we must take a reference, because * dst_release() will destroy dst as soon as its refcount becomes zero */ - if (unlikely(dst->flags & DST_NOCACHE)) { + if (unlikely((dst->flags & DST_NOCACHE) && !force)) { dst_hold(dst); skb_dst_set(skb, dst); } else { skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; } } -EXPORT_SYMBOL(skb_dst_set_noref); +EXPORT_SYMBOL(__skb_dst_set_noref); /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat -- cgit From 094f7b69ea738d7d619cba449d2af97159949459 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 1 Apr 2013 08:14:24 -0400 Subject: selinux: make security_sb_clone_mnt_opts return an error on context mismatch I had the following problem reported a while back. 
If you mount the same filesystem twice using NFSv4 with different contexts, then the second context= option is ignored. For instance: # mount server:/export /mnt/test1 # mount server:/export /mnt/test2 -o context=system_u:object_r:tmp_t:s0 # ls -dZ /mnt/test1 drwxrwxrwt. root root system_u:object_r:nfs_t:s0 /mnt/test1 # ls -dZ /mnt/test2 drwxrwxrwt. root root system_u:object_r:nfs_t:s0 /mnt/test2 When we call into SELinux to set the context of a "cloned" superblock, it will currently just bail out when it notices that we're reusing an existing superblock. Since the existing superblock is already set up and presumably in use, we can't go overwriting its context with the one from the "original" sb. Because of this, the second context= option cannot take effect. This patch fixes this by turning security_sb_clone_mnt_opts into an int return operation. When it finds that the "new" superblock that it has been handed is already set up, it checks to see whether the contexts on the old superblock match it. If they do, then it will just return success; otherwise it'll return -EBUSY and emit a printk to tell the admin why the second mount failed. Note that this patch may cause casualties. The NFSv4 code relies on being able to walk down to an export from the pseudoroot. If you mount filesystems that are nested within one another with different contexts, then this patch will make those mounts fail in new and "exciting" ways. For instance, suppose that /export is a separate filesystem on the server: # mount server:/ /mnt/test1 # mount salusa:/export /mnt/test2 -o context=system_u:object_r:tmp_t:s0 mount.nfs: an incorrect mount option was specified ...with the printk in the ring buffer. Because we *might* eventually walk down to /mnt/test1/export, the mount is denied due to this patch. The second mount needs the pseudoroot superblock, but that's already present with the wrong context. OTOH, if we mount these in the reverse order, then both mounts work, because the pseudoroot superblock created when mounting /export is discarded once that mount is done. However, if we then try to walk into that directory, the automount fails for similar reasons: # cd /mnt/test1/scratch/ -bash: cd: /mnt/test1/scratch: Device or resource busy The story I've gotten from the SELinux folks that I've talked to is that this is desirable behavior. In SELinux-land, mounting the same data under different contexts is wrong -- there can be only one.
Cc: Steve Dickson Cc: Stephen Smalley Signed-off-by: Jeff Layton Acked-by: Eric Paris Signed-off-by: James Morris --- fs/nfs/super.c | 3 +-- include/linux/security.h | 10 ++++++---- security/capability.c | 3 ++- security/security.c | 4 ++-- security/selinux/hooks.c | 39 +++++++++++++++++++++++++++++++++++---- 5 files changed, 46 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 95cdcb208dfb..6b4bf7622280 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2380,10 +2380,9 @@ int nfs_clone_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { /* clone any lsm security options from the parent to the new sb */ - security_sb_clone_mnt_opts(mount_info->cloned->sb, s); if (mntroot->d_inode->i_op != NFS_SB(s)->nfs_client->rpc_ops->dir_inode_ops) return -ESTALE; - return 0; + return security_sb_clone_mnt_opts(mount_info->cloned->sb, s); } EXPORT_SYMBOL_GPL(nfs_clone_sb_security); diff --git a/include/linux/security.h b/include/linux/security.h index eee7478cda70..4c7058dc5514 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1436,7 +1436,7 @@ struct security_operations { struct path *new_path); int (*sb_set_mnt_opts) (struct super_block *sb, struct security_mnt_opts *opts); - void (*sb_clone_mnt_opts) (const struct super_block *oldsb, + int (*sb_clone_mnt_opts) (const struct super_block *oldsb, struct super_block *newsb); int (*sb_parse_opts_str) (char *options, struct security_mnt_opts *opts); @@ -1721,7 +1721,7 @@ int security_sb_mount(const char *dev_name, struct path *path, int security_sb_umount(struct vfsmount *mnt, int flags); int security_sb_pivotroot(struct path *old_path, struct path *new_path); int security_sb_set_mnt_opts(struct super_block *sb, struct security_mnt_opts *opts); -void security_sb_clone_mnt_opts(const struct super_block *oldsb, +int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb); int security_sb_parse_opts_str(char *options, struct security_mnt_opts *opts); @@ -2011,9 +2011,11 @@ static inline int security_sb_set_mnt_opts(struct super_block *sb, return 0; } -static inline void security_sb_clone_mnt_opts(const struct super_block *oldsb, +static inline int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb) -{ } +{ + return 0; +} static inline int security_sb_parse_opts_str(char *options, struct security_mnt_opts *opts) { diff --git a/security/capability.c b/security/capability.c index 579775088967..a6290b625be9 100644 --- a/security/capability.c +++ b/security/capability.c @@ -98,9 +98,10 @@ static int cap_sb_set_mnt_opts(struct super_block *sb, return 0; } -static void cap_sb_clone_mnt_opts(const struct super_block *oldsb, +static int cap_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb) { + return 0; } static int cap_sb_parse_opts_str(char *options, struct security_mnt_opts *opts) diff --git a/security/security.c b/security/security.c index 7b88c6aeaed4..108281d2307a 100644 --- a/security/security.c +++ b/security/security.c @@ -299,10 +299,10 @@ int security_sb_set_mnt_opts(struct super_block *sb, } EXPORT_SYMBOL(security_sb_set_mnt_opts); -void security_sb_clone_mnt_opts(const struct super_block *oldsb, +int security_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb) { - security_ops->sb_clone_mnt_opts(oldsb, newsb); + return security_ops->sb_clone_mnt_opts(oldsb, newsb); } EXPORT_SYMBOL(security_sb_clone_mnt_opts); 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 2fa28c88900c..3c02be3f6732 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -750,7 +750,37 @@ out_double_mount: goto out; } -static void selinux_sb_clone_mnt_opts(const struct super_block *oldsb, +static int selinux_cmp_sb_context(const struct super_block *oldsb, + const struct super_block *newsb) +{ + struct superblock_security_struct *old = oldsb->s_security; + struct superblock_security_struct *new = newsb->s_security; + char oldflags = old->flags & SE_MNTMASK; + char newflags = new->flags & SE_MNTMASK; + + if (oldflags != newflags) + goto mismatch; + if ((oldflags & FSCONTEXT_MNT) && old->sid != new->sid) + goto mismatch; + if ((oldflags & CONTEXT_MNT) && old->mntpoint_sid != new->mntpoint_sid) + goto mismatch; + if ((oldflags & DEFCONTEXT_MNT) && old->def_sid != new->def_sid) + goto mismatch; + if (oldflags & ROOTCONTEXT_MNT) { + struct inode_security_struct *oldroot = oldsb->s_root->d_inode->i_security; + struct inode_security_struct *newroot = newsb->s_root->d_inode->i_security; + if (oldroot->sid != newroot->sid) + goto mismatch; + } + return 0; +mismatch: + printk(KERN_WARNING "SELinux: mount invalid. Same superblock, " + "different security settings for (dev %s, " + "type %s)\n", newsb->s_id, newsb->s_type->name); + return -EBUSY; +} + +static int selinux_sb_clone_mnt_opts(const struct super_block *oldsb, struct super_block *newsb) { const struct superblock_security_struct *oldsbsec = oldsb->s_security; @@ -765,14 +795,14 @@ static void selinux_sb_clone_mnt_opts(const struct super_block *oldsb, * mount options. thus we can safely deal with this superblock later */ if (!ss_initialized) - return; + return 0; /* how can we clone if the old one wasn't set up?? */ BUG_ON(!(oldsbsec->flags & SE_SBINITIALIZED)); - /* if fs is reusing a sb, just let its options stand... */ + /* if fs is reusing a sb, make sure that the contexts match */ if (newsbsec->flags & SE_SBINITIALIZED) - return; + return selinux_cmp_sb_context(oldsb, newsb); mutex_lock(&newsbsec->lock); @@ -805,6 +835,7 @@ static void selinux_sb_clone_mnt_opts(const struct super_block *oldsb, sb_finish_set_opts(newsb); mutex_unlock(&newsbsec->lock); + return 0; } static int selinux_parse_opts_str(char *options, -- cgit From 181387da2d64c3129e5b5186c4dd388bc5041d53 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: remove unused bdi_pending_list There's no user left. Remove it. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu --- include/linux/backing-dev.h | 1 - mm/backing-dev.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 350459910fe1..a5ef27f5411a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,7 +130,6 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; -extern struct list_head bdi_pending_list; static inline int wb_has_dirty_io(struct bdi_writeback *wb) { diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 41733c5dc820..657569b3fcf6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -31,13 +31,11 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as - * reader side protection for bdi_pending_list. 
bdi_list has RCU reader side + * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side * locking. */ DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); -LIST_HEAD(bdi_pending_list); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { -- cgit From 839a8e8660b6777e7fe4e80af1a048aebe2b5977 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Apr 2013 19:08:06 -0700 Subject: writeback: replace custom worker pool implementation with unbound workqueue Writeback implements its own worker pool - each bdi can be associated with a worker thread which is created and destroyed dynamically. The worker thread for the default bdi is always present and serves as the "forker" thread which forks off worker threads for other bdis. there's no reason for writeback to implement its own worker pool when using unbound workqueue instead is much simpler and more efficient. This patch replaces custom worker pool implementation in writeback with an unbound workqueue. The conversion isn't too complicated but the followings are worth mentioning. * bdi_writeback->last_active, task and wakeup_timer are removed. delayed_work ->dwork is added instead. Explicit timer handling is no longer necessary. Everything works by either queueing / modding / flushing / canceling the delayed_work item. * bdi_writeback_thread() becomes bdi_writeback_workfn() which runs off bdi_writeback->dwork. On each execution, it processes bdi->work_list and reschedules itself if there are more things to do. The function also handles low-mem condition, which used to be handled by the forker thread. If the function is running off a rescuer thread, it only writes out limited number of pages so that the rescuer can serve other bdis too. This preserves the flusher creation failure behavior of the forker thread. * INIT_LIST_HEAD(&bdi->bdi_list) is used to tell bdi_writeback_workfn() about on-going bdi unregistration so that it always drains work_list even if it's running off the rescuer. Note that the original code was broken in this regard. Under memory pressure, a bdi could finish unregistration with non-empty work_list. * The default bdi is no longer special. It now is treated the same as any other bdi and bdi_cap_flush_forker() is removed. * BDI_pending is no longer used. Removed. * Some tracepoints become non-applicable. The following TPs are removed - writeback_nothread, writeback_wake_thread, writeback_wake_forker_thread, writeback_thread_start, writeback_thread_stop. Everything, including devices coming and going away and rescuer operation under simulated memory pressure, seems to work fine in my test setup. Signed-off-by: Tejun Heo Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Fengguang Wu Cc: Jeff Moyer --- fs/fs-writeback.c | 102 +++++----------- include/linux/backing-dev.h | 15 +-- include/trace/events/writeback.h | 5 - mm/backing-dev.c | 255 +++++---------------------------------- 4 files changed, 65 insertions(+), 312 deletions(-) (limited to 'include/linux') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 21f46fb3a101..8067d3719e94 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head) #define CREATE_TRACE_POINTS #include -/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. 
*/ -static void bdi_wakeup_flusher(struct backing_dev_info *bdi) -{ - if (bdi->wb.task) { - wake_up_process(bdi->wb.task); - } else { - /* - * The bdi thread isn't there, wake up the forker thread which - * will create and run it. - */ - wake_up_process(default_backing_dev_info.wb.task); - } -} - static void bdi_queue_work(struct backing_dev_info *bdi, struct wb_writeback_work *work) { @@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi, spin_lock_bh(&bdi->wb_lock); list_add_tail(&work->list, &bdi->work_list); - if (!bdi->wb.task) - trace_writeback_nothread(bdi, work); - bdi_wakeup_flusher(bdi); spin_unlock_bh(&bdi->wb_lock); + + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } static void @@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - if (bdi->wb.task) { - trace_writeback_nowork(bdi); - wake_up_process(bdi->wb.task); - } + trace_writeback_nowork(bdi); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); return; } @@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) * writeback as soon as there is no other work to do. */ trace_writeback_wake_background(bdi); - spin_lock_bh(&bdi->wb_lock); - bdi_wakeup_flusher(bdi); - spin_unlock_bh(&bdi->wb_lock); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); } /* @@ -1020,66 +1000,48 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait) /* * Handle writeback of dirty data for the device backed by this bdi. Also - * wakes up periodically and does kupdated style flushing. + * reschedules periodically and does kupdated style flushing. */ -int bdi_writeback_thread(void *data) +void bdi_writeback_workfn(struct work_struct *work) { - struct bdi_writeback *wb = data; + struct bdi_writeback *wb = container_of(to_delayed_work(work), + struct bdi_writeback, dwork); struct backing_dev_info *bdi = wb->bdi; long pages_written; current->flags |= PF_SWAPWRITE; - set_freezable(); - wb->last_active = jiffies; - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - trace_writeback_thread_start(bdi); - while (!kthread_freezable_should_stop(NULL)) { + if (likely(!current_is_workqueue_rescuer() || + list_empty(&bdi->bdi_list))) { /* - * Remove own delayed wake-up timer, since we are already awake - * and we'll take care of the periodic write-back. + * The normal path. Keep writing back @bdi until its + * work_list is empty. Note that this path is also taken + * if @bdi is shutting down even when we're running off the + * rescuer as work_list needs to be drained. */ - del_timer(&wb->wakeup_timer); - - pages_written = wb_do_writeback(wb, 0); - + do { + pages_written = wb_do_writeback(wb, 0); + trace_writeback_pages_written(pages_written); + } while (!list_empty(&bdi->work_list)); + } else { + /* + * bdi_wq can't get enough workers and we're running off + * the emergency worker. Don't hog it. Hopefully, 1024 is + * enough for efficient IO. 
+ */ + pages_written = writeback_inodes_wb(&bdi->wb, 1024, + WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); - - if (pages_written) - wb->last_active = jiffies; - - set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&bdi->work_list) || kthread_should_stop()) { - __set_current_state(TASK_RUNNING); - continue; - } - - if (wb_has_dirty_io(wb) && dirty_writeback_interval) - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - else { - /* - * We have nothing to do, so can go sleep without any - * timeout and save power. When a work is queued or - * something is made dirty - we will be woken up. - */ - schedule(); - } } - /* Flush any work that raced with us exiting */ - if (!list_empty(&bdi->work_list)) - wb_do_writeback(wb, 1); + if (!list_empty(&bdi->work_list) || + (wb_has_dirty_io(wb) && dirty_writeback_interval)) + queue_delayed_work(bdi_wq, &wb->dwork, + msecs_to_jiffies(dirty_writeback_interval * 10)); - trace_writeback_thread_stop(bdi); - return 0; + current->flags &= ~PF_SWAPWRITE; } - /* * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back * the whole world. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a5ef27f5411a..c3881553f7d1 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,6 +18,7 @@ #include #include #include +#include struct page; struct device; @@ -27,7 +28,6 @@ struct dentry; * Bits in backing_dev_info.state */ enum bdi_state { - BDI_pending, /* On its way to being activated */ BDI_wb_alloc, /* Default embedded wb allocated */ BDI_async_congested, /* The async (write) queue is getting full */ BDI_sync_congested, /* The sync queue is getting full */ @@ -53,10 +53,8 @@ struct bdi_writeback { unsigned int nr; unsigned long last_old_flush; /* last old data flush */ - unsigned long last_active; /* last time bdi thread was active */ - struct task_struct *task; /* writeback thread */ - struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */ + struct delayed_work dwork; /* work item used for writeback */ struct list_head b_dirty; /* dirty inodes */ struct list_head b_io; /* parked for writeback */ struct list_head b_more_io; /* parked for more writeback */ @@ -123,7 +121,7 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); -int bdi_writeback_thread(void *data); +void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); @@ -131,6 +129,8 @@ void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2); extern spinlock_t bdi_lock; extern struct list_head bdi_list; +extern struct workqueue_struct *bdi_wq; + static inline int wb_has_dirty_io(struct bdi_writeback *wb) { return !list_empty(&wb->b_dirty) || @@ -335,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi) return bdi->capabilities & BDI_CAP_SWAP_BACKED; } -static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi) -{ - return bdi == &default_backing_dev_info; -} - static inline bool mapping_cap_writeback_dirty(struct address_space *mapping) { return bdi_cap_writeback_dirty(mapping->backing_dev_info); diff --git a/include/trace/events/writeback.h 
b/include/trace/events/writeback.h index 6a16fd2e70ed..464ea82e10db 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class, DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \ TP_ARGS(bdi, work)) -DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread); DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); @@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \ DEFINE_WRITEBACK_EVENT(writeback_nowork); DEFINE_WRITEBACK_EVENT(writeback_wake_background); -DEFINE_WRITEBACK_EVENT(writeback_wake_thread); -DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread); DEFINE_WRITEBACK_EVENT(writeback_bdi_register); DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister); -DEFINE_WRITEBACK_EVENT(writeback_thread_start); -DEFINE_WRITEBACK_EVENT(writeback_thread_stop); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 657569b3fcf6..2857d4f6bca4 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -37,6 +37,9 @@ static struct class *bdi_class; DEFINE_SPINLOCK(bdi_lock); LIST_HEAD(bdi_list); +/* bdi_wq serves all asynchronous writeback tasks */ +struct workqueue_struct *bdi_wq; + void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2) { if (wb1 < wb2) { @@ -255,6 +258,11 @@ static int __init default_bdi_init(void) { int err; + bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND, 0); + if (!bdi_wq) + return -ENOMEM; + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); @@ -269,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi) return wb_has_dirty_io(&bdi->wb); } -static void wakeup_timer_fn(unsigned long data) -{ - struct backing_dev_info *bdi = (struct backing_dev_info *)data; - - spin_lock_bh(&bdi->wb_lock); - if (bdi->wb.task) { - trace_writeback_wake_thread(bdi); - wake_up_process(bdi->wb.task); - } else if (bdi->dev) { - /* - * When bdi tasks are inactive for long time, they are killed. - * In this case we have to wake-up the forker thread which - * should create and run the bdi thread. - */ - trace_writeback_wake_forker_thread(bdi); - wake_up_process(default_backing_dev_info.wb.task); - } - spin_unlock_bh(&bdi->wb_lock); -} - /* * This function is used when the first inode for this bdi is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the @@ -305,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); - mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); -} - -/* - * Calculate the longest interval (jiffies) bdi threads are allowed to be - * inactive. 
- */ -static unsigned long bdi_longest_inactive(void) -{ - unsigned long interval; - - interval = msecs_to_jiffies(dirty_writeback_interval * 10); - return max(5UL * 60 * HZ, interval); -} - -/* - * Clear pending bit and wakeup anybody waiting for flusher thread creation or - * shutdown - */ -static void bdi_clear_pending(struct backing_dev_info *bdi) -{ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); -} - -static int bdi_forker_thread(void *ptr) -{ - struct bdi_writeback *me = ptr; - - current->flags |= PF_SWAPWRITE; - set_freezable(); - - /* - * Our parent may run at a different priority, just set us to normal - */ - set_user_nice(current, 0); - - for (;;) { - struct task_struct *task = NULL; - struct backing_dev_info *bdi; - enum { - NO_ACTION, /* Nothing to do */ - FORK_THREAD, /* Fork bdi thread */ - KILL_THREAD, /* Kill inactive bdi thread */ - } action = NO_ACTION; - - /* - * Temporary measure, we want to make sure we don't see - * dirty data on the default backing_dev_info - */ - if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { - del_timer(&me->wakeup_timer); - wb_do_writeback(me, 0); - } - - spin_lock_bh(&bdi_lock); - /* - * In the following loop we are going to check whether we have - * some work to do without any synchronization with tasks - * waking us up to do work for them. Set the task state here - * so that we don't miss wakeups after verifying conditions. - */ - set_current_state(TASK_INTERRUPTIBLE); - - list_for_each_entry(bdi, &bdi_list, bdi_list) { - bool have_dirty_io; - - if (!bdi_cap_writeback_dirty(bdi) || - bdi_cap_flush_forker(bdi)) - continue; - - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi %p/%s is not registered!\n", bdi, bdi->name); - - have_dirty_io = !list_empty(&bdi->work_list) || - wb_has_dirty_io(&bdi->wb); - - /* - * If the bdi has work to do, but the thread does not - * exist - create it. - */ - if (!bdi->wb.task && have_dirty_io) { - /* - * Set the pending bit - if someone will try to - * unregister this bdi - it'll wait on this bit. - */ - set_bit(BDI_pending, &bdi->state); - action = FORK_THREAD; - break; - } - - spin_lock(&bdi->wb_lock); - - /* - * If there is no work to do and the bdi thread was - * inactive long enough - kill it. The wb_lock is taken - * to make sure no-one adds more work to this bdi and - * wakes the bdi thread up. - */ - if (bdi->wb.task && !have_dirty_io && - time_after(jiffies, bdi->wb.last_active + - bdi_longest_inactive())) { - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock(&bdi->wb_lock); - set_bit(BDI_pending, &bdi->state); - action = KILL_THREAD; - break; - } - spin_unlock(&bdi->wb_lock); - } - spin_unlock_bh(&bdi_lock); - - /* Keep working if default bdi still has things to do */ - if (!list_empty(&me->bdi->work_list)) - __set_current_state(TASK_RUNNING); - - switch (action) { - case FORK_THREAD: - __set_current_state(TASK_RUNNING); - task = kthread_create(bdi_writeback_thread, &bdi->wb, - "flush-%s", dev_name(bdi->dev)); - if (IS_ERR(task)) { - /* - * If thread creation fails, force writeout of - * the bdi from the thread. Hopefully 1024 is - * large enough for efficient IO. - */ - writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); - } else { - /* - * The spinlock makes sure we do not lose - * wake-ups when racing with 'bdi_queue_work()'. - * And as soon as the bdi thread is visible, we - * can start it. 
- */ - spin_lock_bh(&bdi->wb_lock); - bdi->wb.task = task; - spin_unlock_bh(&bdi->wb_lock); - wake_up_process(task); - } - bdi_clear_pending(bdi); - break; - - case KILL_THREAD: - __set_current_state(TASK_RUNNING); - kthread_stop(task); - bdi_clear_pending(bdi); - break; - - case NO_ACTION: - if (!wb_has_dirty_io(me) || !dirty_writeback_interval) - /* - * There are no dirty data. The only thing we - * should now care about is checking for - * inactive bdi threads and killing them. Thus, - * let's sleep for longer time, save energy and - * be friendly for battery-driven devices. - */ - schedule_timeout(bdi_longest_inactive()); - else - schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); - try_to_freeze(); - break; - } - } - - return 0; + mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout); } /* @@ -487,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi) spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); + + /* bdi_list is now unused, clear it to mark @bdi dying */ + INIT_LIST_HEAD(&bdi->bdi_list); } int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -506,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, bdi->dev = dev; - /* - * Just start the forker thread for our default backing_dev_info, - * and add other bdi's to the list. They will get a thread created - * on-demand when they need it. - */ - if (bdi_cap_flush_forker(bdi)) { - struct bdi_writeback *wb = &bdi->wb; - - wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", - dev_name(dev)); - if (IS_ERR(wb->task)) - return PTR_ERR(wb->task); - } - bdi_debug_register(bdi, dev_name(dev)); set_bit(BDI_registered, &bdi->state); @@ -543,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev); */ static void bdi_wb_shutdown(struct backing_dev_info *bdi) { - struct task_struct *task; - if (!bdi_cap_writeback_dirty(bdi)) return; @@ -554,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi) bdi_remove_from_list(bdi); /* - * If setup is pending, wait for that to complete first + * Drain work list and shutdown the delayed_work. At this point, + * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi + * is dying and its work_list needs to be drained no matter what. */ - wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, - TASK_UNINTERRUPTIBLE); + mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + flush_delayed_work(&bdi->wb.dwork); + WARN_ON(!list_empty(&bdi->work_list)); /* - * Finally, kill the kernel thread. We don't need to be RCU - * safe anymore, since the bdi is gone from visibility. + * This shouldn't be necessary unless @bdi for some reason has + * unflushed dirty IO after work_list is drained. Do it anyway + * just in case. 
*/ - spin_lock_bh(&bdi->wb_lock); - task = bdi->wb.task; - bdi->wb.task = NULL; - spin_unlock_bh(&bdi->wb_lock); - - if (task) - kthread_stop(task); + cancel_delayed_work_sync(&bdi->wb.dwork); } /* @@ -595,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi) bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); - del_timer_sync(&bdi->wb.wakeup_timer); - if (!bdi_cap_flush_forker(bdi)) - bdi_wb_shutdown(bdi); + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); spin_lock_bh(&bdi->wb_lock); @@ -620,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); spin_lock_init(&wb->list_lock); - setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); + INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn); } /* @@ -693,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi) bdi_unregister(bdi); /* - * If bdi_unregister() had already been called earlier, the - * wakeup_timer could still be armed because bdi_prune_sb() - * can race with the bdi_wakeup_thread_delayed() calls from - * __mark_inode_dirty(). + * If bdi_unregister() had already been called earlier, the dwork + * could still be pending because bdi_prune_sb() can race with the + * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty(). */ - del_timer_sync(&bdi->wb.wakeup_timer); + cancel_delayed_work_sync(&bdi->wb.dwork); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) percpu_counter_destroy(&bdi->bdi_stat[i]); -- cgit From 19baba4cb6843bbe3dfde87e1e913f6a9cd27da9 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 9 Mar 2013 08:16:44 +0000 Subject: i2c: Remove detach_adapter The detach_adapter callback has been deprecated for quite some time and has no user left. Keeping it alive blocks other cleanups, so remove it. 
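For context, here is a minimal sketch of the binding pattern that makes adapter callbacks unnecessary (hypothetical names; assumes the standard device model binding for the i2c API of this era): the core calls remove() when a client or its adapter goes away, so a client driver never needs detach_adapter.

#include <linux/i2c.h>
#include <linux/module.h>

/* Hypothetical client driver: the driver model invokes remove() on
 * unbinding (including adapter removal), so no adapter callbacks. */
static int example_probe(struct i2c_client *client,
			 const struct i2c_device_id *id)
{
	return 0;
}

static int example_remove(struct i2c_client *client)
{
	return 0;
}

static const struct i2c_device_id example_id[] = {
	{ "example", 0 },
	{ }
};
MODULE_DEVICE_TABLE(i2c, example_id);

static struct i2c_driver example_driver = {
	.driver = {
		.name = "example",
	},
	.probe = example_probe,
	.remove = example_remove,
	.id_table = example_id,
};
module_i2c_driver(example_driver);
MODULE_LICENSE("GPL");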
Signed-off-by: Lars-Peter Clausen Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 33 ++++++++++----------------------- include/linux/i2c.h | 7 ++----- 2 files changed, 12 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index 0d873ba2e82e..f7cd05b4f327 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -49,7 +49,7 @@ /* core_lock protects i2c_adapter_idr, and guarantees that device detection, deletion of detected devices, and attach_adapter - and detach_adapter calls are serialized */ + calls are serialized */ static DEFINE_MUTEX(core_lock); static DEFINE_IDR(i2c_adapter_idr); @@ -1172,11 +1172,10 @@ int i2c_add_numbered_adapter(struct i2c_adapter *adap) } EXPORT_SYMBOL_GPL(i2c_add_numbered_adapter); -static int i2c_do_del_adapter(struct i2c_driver *driver, +static void i2c_do_del_adapter(struct i2c_driver *driver, struct i2c_adapter *adapter) { struct i2c_client *client, *_n; - int res; /* Remove the devices we created ourselves as the result of hardware * probing (using a driver's detect method) */ @@ -1188,16 +1187,6 @@ static int i2c_do_del_adapter(struct i2c_driver *driver, i2c_unregister_device(client); } } - - if (!driver->detach_adapter) - return 0; - dev_warn(&adapter->dev, "%s: detach_adapter method is deprecated\n", - driver->driver.name); - res = driver->detach_adapter(adapter); - if (res) - dev_err(&adapter->dev, "detach_adapter failed (%d) " - "for driver [%s]\n", res, driver->driver.name); - return res; } static int __unregister_client(struct device *dev, void *dummy) @@ -1218,7 +1207,8 @@ static int __unregister_dummy(struct device *dev, void *dummy) static int __process_removed_adapter(struct device_driver *d, void *data) { - return i2c_do_del_adapter(to_i2c_driver(d), data); + i2c_do_del_adapter(to_i2c_driver(d), data); + return 0; } /** @@ -1231,7 +1221,6 @@ static int __process_removed_adapter(struct device_driver *d, void *data) */ int i2c_del_adapter(struct i2c_adapter *adap) { - int res = 0; struct i2c_adapter *found; struct i2c_client *client, *next; @@ -1247,11 +1236,9 @@ int i2c_del_adapter(struct i2c_adapter *adap) /* Tell drivers about this removal */ mutex_lock(&core_lock); - res = bus_for_each_drv(&i2c_bus_type, NULL, adap, + bus_for_each_drv(&i2c_bus_type, NULL, adap, __process_removed_adapter); mutex_unlock(&core_lock); - if (res) - return res; /* Remove devices instantiated from sysfs */ mutex_lock_nested(&adap->userspace_clients_lock, @@ -1270,8 +1257,8 @@ int i2c_del_adapter(struct i2c_adapter *adap) * we can't remove the dummy devices during the first pass: they * could have been instantiated by real devices wishing to clean * them up properly, so we give them a chance to do that first. 
*/ - res = device_for_each_child(&adap->dev, NULL, __unregister_client); - res = device_for_each_child(&adap->dev, NULL, __unregister_dummy); + device_for_each_child(&adap->dev, NULL, __unregister_client); + device_for_each_child(&adap->dev, NULL, __unregister_dummy); #ifdef CONFIG_I2C_COMPAT class_compat_remove_link(i2c_adapter_compat_class, &adap->dev, @@ -1367,9 +1354,9 @@ EXPORT_SYMBOL(i2c_register_driver); static int __process_removed_driver(struct device *dev, void *data) { - if (dev->type != &i2c_adapter_type) - return 0; - return i2c_do_del_adapter(data, to_i2c_adapter(dev)); + if (dev->type == &i2c_adapter_type) + i2c_do_del_adapter(data, to_i2c_adapter(dev)); + return 0; } /** diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 2eca3860b77f..f2bcd46ce194 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -125,7 +125,6 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, * struct i2c_driver - represent an I2C device driver * @class: What kind of i2c device we instantiate (for detect) * @attach_adapter: Callback for bus addition (deprecated) - * @detach_adapter: Callback for bus removal (deprecated) * @probe: Callback for device binding * @remove: Callback for device unbinding * @shutdown: Callback for device shutdown @@ -162,12 +161,10 @@ extern s32 i2c_smbus_write_i2c_block_data(const struct i2c_client *client, struct i2c_driver { unsigned int class; - /* Notifies the driver that a new bus has appeared or is about to be - * removed. You should avoid using this, it will be removed in a - * near future. + /* Notifies the driver that a new bus has appeared. You should avoid + * using this, it will be removed in a near future. */ int (*attach_adapter)(struct i2c_adapter *) __deprecated; - int (*detach_adapter)(struct i2c_adapter *) __deprecated; /* Standard driver model interfaces */ int (*probe)(struct i2c_client *, const struct i2c_device_id *); -- cgit From 71546300c8684eb69286604c79624582c16f2f5b Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 9 Mar 2013 08:16:47 +0000 Subject: i2c: Make return type of i2c_del_adapter() void i2c_del_adapter() is usually called from a driver's remove callback. The Linux device driver model does not allow the remove callback to fail; all resources allocated in the probe callback need to be freed, and all resources which have been provided to the rest of the kernel (for example an I2C adapter) need to be revoked. So any function revoking such resources isn't allowed to fail either. i2c_del_adapter() adheres to this requirement and will never fail. But i2c_del_adapter()'s return type is int, which may cause driver authors to think that it can fail. This led to code constructs like: ret = i2c_del_adapter(...); BUG_ON(ret); Since i2c_del_adapter() always returns 0, the BUG_ON is never hit and essentially becomes dead code, which means it can be removed. Making the return type of i2c_del_adapter() void makes it explicit that the function will never fail and should prevent constructs like the above from re-appearing in the kernel code. All callers of i2c_del_adapter() have already been updated in a previous patch to ignore the return value, so the conversion of the return type from int to void can be done without causing any build failures.
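The resulting caller pattern, as a sketch (hypothetical bus driver names; assumes the adapter was registered in probe()):

#include <linux/i2c.h>
#include <linux/platform_device.h>

struct example_i2c_dev {
	struct i2c_adapter adapter;
};

static int example_i2c_remove(struct platform_device *pdev)
{
	struct example_i2c_dev *dev = platform_get_drvdata(pdev);

	/* i2c_del_adapter() cannot fail: nothing to check, no BUG_ON() */
	i2c_del_adapter(&dev->adapter);
	return 0;
}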
Signed-off-by: Lars-Peter Clausen Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core.c | 6 ++---- include/linux/i2c.h | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-core.c b/drivers/i2c/i2c-core.c index e4fe4940fd82..e9ac0d84a01e 100644 --- a/drivers/i2c/i2c-core.c +++ b/drivers/i2c/i2c-core.c @@ -1219,7 +1219,7 @@ static int __process_removed_adapter(struct device_driver *d, void *data) * This unregisters an I2C adapter which was previously registered * by @i2c_add_adapter or @i2c_add_numbered_adapter. */ -int i2c_del_adapter(struct i2c_adapter *adap) +void i2c_del_adapter(struct i2c_adapter *adap) { struct i2c_adapter *found; struct i2c_client *client, *next; @@ -1231,7 +1231,7 @@ int i2c_del_adapter(struct i2c_adapter *adap) if (found != adap) { pr_debug("i2c-core: attempting to delete unregistered " "adapter [%s]\n", adap->name); - return 0; + return; } /* Tell drivers about this removal */ @@ -1283,8 +1283,6 @@ int i2c_del_adapter(struct i2c_adapter *adap) /* Clear the device structure in case this adapter is ever going to be added again */ memset(&adap->dev, 0, sizeof(adap->dev)); - - return 0; } EXPORT_SYMBOL(i2c_del_adapter); diff --git a/include/linux/i2c.h b/include/linux/i2c.h index f2bcd46ce194..e988fa935b3c 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -488,7 +488,7 @@ void i2c_unlock_adapter(struct i2c_adapter *); */ #if defined(CONFIG_I2C) || defined(CONFIG_I2C_MODULE) extern int i2c_add_adapter(struct i2c_adapter *); -extern int i2c_del_adapter(struct i2c_adapter *); +extern void i2c_del_adapter(struct i2c_adapter *); extern int i2c_add_numbered_adapter(struct i2c_adapter *); extern int i2c_register_driver(struct module *, struct i2c_driver *); -- cgit From 51d95709dddf7fdf6769a547de37a9c98edf8df9 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Sat, 9 Mar 2013 08:16:49 +0000 Subject: i2c: Make the return type of i2c_del_mux_adapter() void i2c_del_mux_adapter() always returns 0, and none of its current users checks the return value anyway. It is also an essential requirement of the Linux device driver model that functions which may be called from a device's remove callback to free resources provided by the device are not allowed to fail. This is the case for i2c_del_mux_adapter(), so make its return type void to make it explicit that it won't fail.
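For the mux case the same reasoning applies on the teardown path; a minimal sketch with a hypothetical "bar" mux driver (names and channel count are illustrative):

#include <linux/i2c.h>
#include <linux/i2c-mux.h>

struct bar_mux {
	struct i2c_adapter *chan[4];
	int num_chan;
};

static int bar_remove(struct i2c_client *client)
{
	struct bar_mux *mux = i2c_get_clientdata(client);
	int i;

	/*
	 * Each child adapter handed to the rest of the kernel is simply
	 * revoked; with a void return there is nothing left to check.
	 */
	for (i = 0; i < mux->num_chan; i++)
		i2c_del_mux_adapter(mux->chan[i]);

	return 0;
}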
Signed-off-by: Lars-Peter Clausen Reviewed-by: Jean Delvare Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-mux.c | 4 +--- include/linux/i2c-mux.h | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/drivers/i2c/i2c-mux.c b/drivers/i2c/i2c-mux.c index 361b78d76759..7409ebb33c47 100644 --- a/drivers/i2c/i2c-mux.c +++ b/drivers/i2c/i2c-mux.c @@ -191,14 +191,12 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, } EXPORT_SYMBOL_GPL(i2c_add_mux_adapter); -int i2c_del_mux_adapter(struct i2c_adapter *adap) +void i2c_del_mux_adapter(struct i2c_adapter *adap) { struct i2c_mux_priv *priv = adap->algo_data; i2c_del_adapter(adap); kfree(priv); - - return 0; } EXPORT_SYMBOL_GPL(i2c_del_mux_adapter); diff --git a/include/linux/i2c-mux.h b/include/linux/i2c-mux.h index 40cb05a97b46..b5f9a007a3ab 100644 --- a/include/linux/i2c-mux.h +++ b/include/linux/i2c-mux.h @@ -42,7 +42,7 @@ struct i2c_adapter *i2c_add_mux_adapter(struct i2c_adapter *parent, int (*deselect) (struct i2c_adapter *, void *mux_dev, u32 chan_id)); -int i2c_del_mux_adapter(struct i2c_adapter *adap); +void i2c_del_mux_adapter(struct i2c_adapter *adap); #endif /* __KERNEL__ */ -- cgit From ef096542642874de10909f02686447a96a66ad14 Mon Sep 17 00:00:00 2001 From: Chao Xie Date: Mon, 25 Mar 2013 03:06:57 -0400 Subject: usb: mv_usb: remove clock name from pdata Using pdata to pass the clock name is not correct. Get the clock directly in the USB drivers instead. Signed-off-by: Chao Xie Signed-off-by: Felipe Balbi --- include/linux/platform_data/mv_usb.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/platform_data/mv_usb.h b/include/linux/platform_data/mv_usb.h index 944b01dd103e..98b7925f1a2d 100644 --- a/include/linux/platform_data/mv_usb.h +++ b/include/linux/platform_data/mv_usb.h @@ -34,8 +34,6 @@ struct mv_usb_addon_irq { }; struct mv_usb_platform_data { - unsigned int clknum; - char **clkname; struct mv_usb_addon_irq *id; /* Only valid for OTG. ID pin change*/ struct mv_usb_addon_irq *vbus; /* valid for OTG/UDC. VBUS change*/ -- cgit From 225da3e3cb1f0db9e4cb7fa2a7dc3a360d1cf788 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Sun, 31 Mar 2013 18:34:43 -0700 Subject: usb: renesas_usbhs: fixup sparse errors for common.c This patch fixes up the sparse errors below: CHECK ${RENESAS_USB}/common.c ${RENESAS_USB}/common.c:313:17: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/common.c:322:17: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/common.c:384:17: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/common.c:524:9: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/common.c:545:9: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/common.c:574:9: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/common.c:606:9: error: incompatible types in conditional expression (different base types) ${RENESAS_USB}/mod_gadget.c:233:28: warning: symbol 'req_clear_feature' was not declared. Should it be static? ${RENESAS_USB}/mod_gadget.c:274:28: warning: symbol 'req_set_feature' was not declared. Should it be static? ${RENESAS_USB}/mod_gadget.c:375:28: warning: symbol 'req_get_status' was not declared. Should it be static?
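The conditional-expression errors stem from dispatch code that feeds a callback's return value into a ?: chain with integer arms; a stripped-down illustration of that error class, assuming a usbhs-style dispatch macro rather than copying the driver's actual one:

#include <linux/errno.h>
#include <linux/platform_device.h>

struct demo_pfunc {
	/* if this returned void, the ?: chain below would mix base types */
	int (*hardware_exit)(struct platform_device *pdev);
};

/*
 * The callback's return value becomes one arm of a conditional
 * expression whose other arms are ints; sparse reports "incompatible
 * types in conditional expression" unless every callback returns int.
 */
#define demo_call(pfunc, func, pdev)				\
	(!(pfunc) ? -ENODEV :					\
	 !(pfunc)->func ? 0 : (pfunc)->func(pdev))

static int demo_exit(struct demo_pfunc *pfunc, struct platform_device *pdev)
{
	return demo_call(pfunc, hardware_exit, pdev);
}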
[ balbi@ti.com : added three sparse fixes to mod_gadget.c ] Signed-off-by: Kuninori Morimoto Signed-off-by: Felipe Balbi --- arch/arm/mach-shmobile/board-armadillo800eva.c | 8 ++++++-- arch/arm/mach-shmobile/board-kzm9g.c | 8 ++++++-- arch/arm/mach-shmobile/board-mackerel.c | 12 +++++++++--- arch/sh/boards/mach-ecovec24/setup.c | 4 +++- drivers/usb/renesas_usbhs/mod_gadget.c | 6 +++--- include/linux/usb/renesas_usbhs.h | 6 +++--- 6 files changed, 30 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-shmobile/board-armadillo800eva.c b/arch/arm/mach-shmobile/board-armadillo800eva.c index f2ec0777cfbe..ff8b7ba9b93c 100644 --- a/arch/arm/mach-shmobile/board-armadillo800eva.c +++ b/arch/arm/mach-shmobile/board-armadillo800eva.c @@ -169,7 +169,7 @@ static int usbhsf_get_id(struct platform_device *pdev) return USBHS_GADGET; } -static void usbhsf_power_ctrl(struct platform_device *pdev, +static int usbhsf_power_ctrl(struct platform_device *pdev, void __iomem *base, int enable) { struct usbhsf_private *priv = usbhsf_get_priv(pdev); @@ -223,6 +223,8 @@ static void usbhsf_power_ctrl(struct platform_device *pdev, clk_disable(priv->pci); /* usb work around */ clk_disable(priv->usb24); /* usb work around */ } + + return 0; } static int usbhsf_get_vbus(struct platform_device *pdev) @@ -239,7 +241,7 @@ static irqreturn_t usbhsf_interrupt(int irq, void *data) return IRQ_HANDLED; } -static void usbhsf_hardware_exit(struct platform_device *pdev) +static int usbhsf_hardware_exit(struct platform_device *pdev) { struct usbhsf_private *priv = usbhsf_get_priv(pdev); @@ -264,6 +266,8 @@ static void usbhsf_hardware_exit(struct platform_device *pdev) priv->usbh_base = NULL; free_irq(IRQ7, pdev); + + return 0; } static int usbhsf_hardware_init(struct platform_device *pdev) diff --git a/arch/arm/mach-shmobile/board-kzm9g.c b/arch/arm/mach-shmobile/board-kzm9g.c index 7f3a6b7e7b7c..a385f570bbfc 100644 --- a/arch/arm/mach-shmobile/board-kzm9g.c +++ b/arch/arm/mach-shmobile/board-kzm9g.c @@ -155,12 +155,14 @@ static int usbhs_get_vbus(struct platform_device *pdev) return !((1 << 7) & __raw_readw(priv->cr2)); } -static void usbhs_phy_reset(struct platform_device *pdev) +static int usbhs_phy_reset(struct platform_device *pdev) { struct usbhs_private *priv = usbhs_get_priv(pdev); /* init phy */ __raw_writew(0x8a0a, priv->cr2); + + return 0; } static int usbhs_get_id(struct platform_device *pdev) @@ -202,7 +204,7 @@ static int usbhs_hardware_init(struct platform_device *pdev) return 0; } -static void usbhs_hardware_exit(struct platform_device *pdev) +static int usbhs_hardware_exit(struct platform_device *pdev) { struct usbhs_private *priv = usbhs_get_priv(pdev); @@ -210,6 +212,8 @@ static void usbhs_hardware_exit(struct platform_device *pdev) __raw_writew(USB_PHY_MODE | USB_PHY_INT_CLR, priv->phy); free_irq(IRQ15, pdev); + + return 0; } static u32 usbhs_pipe_cfg[] = { diff --git a/arch/arm/mach-shmobile/board-mackerel.c b/arch/arm/mach-shmobile/board-mackerel.c index db968a585ff0..979237c18dad 100644 --- a/arch/arm/mach-shmobile/board-mackerel.c +++ b/arch/arm/mach-shmobile/board-mackerel.c @@ -596,12 +596,14 @@ static int usbhs_get_vbus(struct platform_device *pdev) return usbhs_is_connected(usbhs_get_priv(pdev)); } -static void usbhs_phy_reset(struct platform_device *pdev) +static int usbhs_phy_reset(struct platform_device *pdev) { struct usbhs_private *priv = usbhs_get_priv(pdev); /* init phy */ __raw_writew(0x8a0a, priv->usbcrcaddr); + + return 0; } static int usbhs0_get_id(struct 
platform_device *pdev) @@ -628,11 +630,13 @@ static int usbhs0_hardware_init(struct platform_device *pdev) return 0; } -static void usbhs0_hardware_exit(struct platform_device *pdev) +static int usbhs0_hardware_exit(struct platform_device *pdev) { struct usbhs_private *priv = usbhs_get_priv(pdev); cancel_delayed_work_sync(&priv->work); + + return 0; } static struct usbhs_private usbhs0_private = { @@ -735,7 +739,7 @@ static int usbhs1_hardware_init(struct platform_device *pdev) return 0; } -static void usbhs1_hardware_exit(struct platform_device *pdev) +static int usbhs1_hardware_exit(struct platform_device *pdev) { struct usbhs_private *priv = usbhs_get_priv(pdev); @@ -743,6 +747,8 @@ static void usbhs1_hardware_exit(struct platform_device *pdev) __raw_writew(USB_PHY_MODE | USB_PHY_INT_CLR, priv->usbphyaddr); free_irq(IRQ8, pdev); + + return 0; } static int usbhs1_get_id(struct platform_device *pdev) diff --git a/arch/sh/boards/mach-ecovec24/setup.c b/arch/sh/boards/mach-ecovec24/setup.c index aaff7671101b..764530c85aa9 100644 --- a/arch/sh/boards/mach-ecovec24/setup.c +++ b/arch/sh/boards/mach-ecovec24/setup.c @@ -254,11 +254,13 @@ static int usbhs_get_id(struct platform_device *pdev) return gpio_get_value(GPIO_PTB3); } -static void usbhs_phy_reset(struct platform_device *pdev) +static int usbhs_phy_reset(struct platform_device *pdev) { /* enable vbus if HOST */ if (!gpio_get_value(GPIO_PTB3)) gpio_set_value(GPIO_PTB5, 1); + + return 0; } static struct renesas_usbhs_platform_info usbhs_info = { diff --git a/drivers/usb/renesas_usbhs/mod_gadget.c b/drivers/usb/renesas_usbhs/mod_gadget.c index c2781bc9dabe..ed4949faa70d 100644 --- a/drivers/usb/renesas_usbhs/mod_gadget.c +++ b/drivers/usb/renesas_usbhs/mod_gadget.c @@ -230,7 +230,7 @@ static int usbhsg_recip_handler_std_clear_endpoint(struct usbhs_priv *priv, return 0; } -struct usbhsg_recip_handle req_clear_feature = { +static struct usbhsg_recip_handle req_clear_feature = { .name = "clear feature", .device = usbhsg_recip_handler_std_control_done, .interface = usbhsg_recip_handler_std_control_done, @@ -271,7 +271,7 @@ static int usbhsg_recip_handler_std_set_endpoint(struct usbhs_priv *priv, return 0; } -struct usbhsg_recip_handle req_set_feature = { +static struct usbhsg_recip_handle req_set_feature = { .name = "set feature", .device = usbhsg_recip_handler_std_set_device, .interface = usbhsg_recip_handler_std_control_done, @@ -372,7 +372,7 @@ static int usbhsg_recip_handler_std_get_endpoint(struct usbhs_priv *priv, return 0; } -struct usbhsg_recip_handle req_get_status = { +static struct usbhsg_recip_handle req_get_status = { .name = "get status", .device = usbhsg_recip_handler_std_get_device, .interface = usbhsg_recip_handler_std_get_interface, diff --git a/include/linux/usb/renesas_usbhs.h b/include/linux/usb/renesas_usbhs.h index c5d36c65c33b..e452ba6ec6bd 100644 --- a/include/linux/usb/renesas_usbhs.h +++ b/include/linux/usb/renesas_usbhs.h @@ -62,14 +62,14 @@ struct renesas_usbhs_platform_callback { * Hardware exit function for platform. 
* it is called when driver was removed */ - void (*hardware_exit)(struct platform_device *pdev); + int (*hardware_exit)(struct platform_device *pdev); /* * option: * * for board specific clock control */ - void (*power_ctrl)(struct platform_device *pdev, + int (*power_ctrl)(struct platform_device *pdev, void __iomem *base, int enable); /* @@ -77,7 +77,7 @@ struct renesas_usbhs_platform_callback { * * Phy reset for platform */ - void (*phy_reset)(struct platform_device *pdev); + int (*phy_reset)(struct platform_device *pdev); /* * get USB ID function -- cgit From 6fed4d869a11fdbb4c6a5e444dfb2c22f92c3e46 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 1 Apr 2013 22:03:06 +0100 Subject: extcon: arizona: Allow configuration of button detection The Arizona button detection circuit is configurable, allowing the system integrator to program a range of thresholds for the buttons supported on the accessory but currently the driver uses the default button ranges and does not provide any flexibility in how this is exposed to the application layer. Provide platform data allowing the user to control this and to map the buttons to keys in the input subsystem. Signed-off-by: Mark Brown --- drivers/extcon/extcon-arizona.c | 164 +++++++++++++++++++++++++++++--------- include/linux/mfd/arizona/pdata.h | 9 +++ 2 files changed, 134 insertions(+), 39 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c index 4bb0e9ae405d..e2339629126a 100644 --- a/drivers/extcon/extcon-arizona.c +++ b/drivers/extcon/extcon-arizona.c @@ -33,7 +33,7 @@ #include #include -#define ARIZONA_NUM_BUTTONS 6 +#define ARIZONA_MAX_MICD_RANGE 8 #define ARIZONA_ACCDET_MODE_MIC 0 #define ARIZONA_ACCDET_MODE_HPL 1 @@ -50,6 +50,9 @@ struct arizona_extcon_info { const struct arizona_micd_config *micd_modes; int micd_num_modes; + const struct arizona_micd_range *micd_ranges; + int num_micd_ranges; + bool micd_reva; bool micd_clamp; @@ -71,20 +74,25 @@ struct arizona_extcon_info { }; static const struct arizona_micd_config micd_default_modes[] = { - { 0, 2 << ARIZONA_MICD_BIAS_SRC_SHIFT, 1 }, { ARIZONA_ACCDET_SRC, 1 << ARIZONA_MICD_BIAS_SRC_SHIFT, 0 }, + { 0, 2 << ARIZONA_MICD_BIAS_SRC_SHIFT, 1 }, }; -static struct { - u16 status; - int report; -} arizona_lvl_to_key[ARIZONA_NUM_BUTTONS] = { - { 0x1, BTN_0 }, - { 0x2, BTN_1 }, - { 0x4, BTN_2 }, - { 0x8, BTN_3 }, - { 0x10, BTN_4 }, - { 0x20, BTN_5 }, +static const struct arizona_micd_range micd_default_ranges[] = { + { .max = 11, .key = BTN_0 }, + { .max = 28, .key = BTN_1 }, + { .max = 54, .key = BTN_2 }, + { .max = 100, .key = BTN_3 }, + { .max = 186, .key = BTN_4 }, + { .max = 430, .key = BTN_5 }, +}; + +static const int arizona_micd_levels[] = { + 3, 6, 8, 11, 13, 16, 18, 21, 23, 26, 28, 31, 34, 36, 39, 41, 44, 46, + 49, 52, 54, 57, 60, 62, 65, 67, 70, 73, 75, 78, 81, 83, 89, 94, 100, + 105, 111, 116, 122, 127, 139, 150, 161, 173, 186, 196, 209, 220, 245, + 270, 295, 321, 348, 375, 402, 430, 489, 550, 614, 681, 752, 903, 1071, + 1257, }; #define ARIZONA_CABLE_MECHANICAL 0 @@ -153,7 +161,7 @@ static void arizona_extcon_set_mode(struct arizona_extcon_info *info, int mode) { struct arizona *arizona = info->arizona; - mode %= info->num_micd_modes; + mode %= info->micd_num_modes; if (arizona->pdata.micd_pol_gpio > 0) gpio_set_value_cansleep(arizona->pdata.micd_pol_gpio, @@ -728,7 +736,7 @@ static irqreturn_t arizona_micdet(int irq, void *data) struct arizona_extcon_info *info = data; struct arizona *arizona = info->arizona; 
unsigned int val, lvl; - int ret, i; + int ret, i, key; mutex_lock(&info->lock); @@ -815,12 +823,13 @@ static irqreturn_t arizona_micdet(int irq, void *data) lvl = val & ARIZONA_MICD_LVL_MASK; lvl >>= ARIZONA_MICD_LVL_SHIFT; - for (i = 0; i < ARIZONA_NUM_BUTTONS; i++) - if (lvl & arizona_lvl_to_key[i].status) - input_report_key(info->input, - arizona_lvl_to_key[i].report, - 1); - input_sync(info->input); + WARN_ON(!lvl); + WARN_ON(ffs(lvl) - 1 >= info->num_micd_ranges); + if (lvl && ffs(lvl) - 1 < info->num_micd_ranges) { + key = info->micd_ranges[ffs(lvl) - 1].key; + input_report_key(info->input, key, 1); + input_sync(info->input); + } } else if (info->detecting) { dev_dbg(arizona->dev, "Headphone detected\n"); @@ -834,9 +843,9 @@ static irqreturn_t arizona_micdet(int irq, void *data) } } else { dev_dbg(arizona->dev, "Mic button released\n"); - for (i = 0; i < ARIZONA_NUM_BUTTONS; i++) + for (i = 0; i < info->num_micd_ranges; i++) input_report_key(info->input, - arizona_lvl_to_key[i].report, 0); + info->micd_ranges[i].key, 0); input_sync(info->input); arizona_extcon_pulse_micbias(info); } @@ -923,9 +932,9 @@ static irqreturn_t arizona_jackdet(int irq, void *data) info->mic = false; info->hpdet_done = false; - for (i = 0; i < ARIZONA_NUM_BUTTONS; i++) + for (i = 0; i < info->num_micd_ranges; i++) input_report_key(info->input, - arizona_lvl_to_key[i].report, 0); + info->micd_ranges[i].key, 0); input_sync(info->input); ret = extcon_update_state(&info->edev, 0xffffffff, 0); @@ -954,13 +963,33 @@ static irqreturn_t arizona_jackdet(int irq, void *data) return IRQ_HANDLED; } +/* Map a level onto a slot in the register bank */ +static void arizona_micd_set_level(struct arizona *arizona, int index, + unsigned int level) +{ + int reg; + unsigned int mask; + + reg = ARIZONA_MIC_DETECT_LEVEL_4 - (index / 2); + + if (!(index % 2)) { + mask = 0x3f00; + level <<= 8; + } else { + mask = 0x3f; + } + + /* Program the level itself */ + regmap_update_bits(arizona->regmap, reg, mask, level); +} + static int arizona_extcon_probe(struct platform_device *pdev) { struct arizona *arizona = dev_get_drvdata(pdev->dev.parent); struct arizona_pdata *pdata; struct arizona_extcon_info *info; int jack_irq_fall, jack_irq_rise; - int ret, mode, i; + int ret, mode, i, j; if (!arizona->dapm || !arizona->dapm->card) return -EPROBE_DEFER; @@ -1013,6 +1042,17 @@ static int arizona_extcon_probe(struct platform_device *pdev) goto err; } + info->input = devm_input_allocate_device(&pdev->dev); + if (!info->input) { + dev_err(arizona->dev, "Can't allocate input dev\n"); + ret = -ENOMEM; + goto err_register; + } + + info->input->name = "Headset"; + info->input->phys = "arizona/extcon"; + info->input->dev.parent = &pdev->dev; + if (pdata->num_micd_configs) { info->micd_modes = pdata->micd_configs; info->micd_num_modes = pdata->num_micd_configs; @@ -1068,6 +1108,66 @@ static int arizona_extcon_probe(struct platform_device *pdev) arizona->pdata.micd_dbtime << ARIZONA_MICD_DBTIME_SHIFT); + BUILD_BUG_ON(ARRAY_SIZE(arizona_micd_levels) != 0x40); + + if (arizona->pdata.num_micd_ranges) { + info->micd_ranges = pdata->micd_ranges; + info->num_micd_ranges = pdata->num_micd_ranges; + } else { + info->micd_ranges = micd_default_ranges; + info->num_micd_ranges = ARRAY_SIZE(micd_default_ranges); + } + + if (arizona->pdata.num_micd_ranges > ARIZONA_MAX_MICD_RANGE) { + dev_err(arizona->dev, "Too many MICD ranges: %d\n", + arizona->pdata.num_micd_ranges); + } + + if (info->num_micd_ranges > 1) { + for (i = 1; i < info->num_micd_ranges; i++) { + if 
(info->micd_ranges[i - 1].max > + info->micd_ranges[i].max) { + dev_err(arizona->dev, + "MICD ranges must be sorted\n"); + ret = -EINVAL; + goto err_input; + } + } + } + + /* Disable all buttons by default */ + regmap_update_bits(arizona->regmap, ARIZONA_MIC_DETECT_2, + ARIZONA_MICD_LVL_SEL_MASK, 0x81); + + /* Set up all the buttons the user specified */ + for (i = 0; i < info->num_micd_ranges; i++) { + for (j = 0; j < ARRAY_SIZE(arizona_micd_levels); j++) + if (arizona_micd_levels[j] >= info->micd_ranges[i].max) + break; + + if (j == ARRAY_SIZE(arizona_micd_levels)) { + dev_err(arizona->dev, "Unsupported MICD level %d\n", + info->micd_ranges[i].max); + ret = -EINVAL; + goto err_input; + } + + dev_dbg(arizona->dev, "%d ohms for MICD threshold %d\n", + arizona_micd_levels[j], i); + + arizona_micd_set_level(arizona, i, j); + input_set_capability(info->input, EV_KEY, + info->micd_ranges[i].key); + + /* Enable reporting of that range */ + regmap_update_bits(arizona->regmap, ARIZONA_MIC_DETECT_2, + 1 << i, 1 << i); + } + + /* Set all the remaining keys to a maximum */ + for (; i < ARIZONA_MAX_MICD_RANGE; i++) + arizona_micd_set_level(arizona, i, 0x3f); + /* * If we have a clamp use it, activating in conjunction with * GPIO5 if that is connected for jack detect operation. @@ -1095,20 +1195,6 @@ static int arizona_extcon_probe(struct platform_device *pdev) arizona_extcon_set_mode(info, 0); - info->input = devm_input_allocate_device(&pdev->dev); - if (!info->input) { - dev_err(arizona->dev, "Can't allocate input dev\n"); - ret = -ENOMEM; - goto err_register; - } - - for (i = 0; i < ARIZONA_NUM_BUTTONS; i++) - input_set_capability(info->input, EV_KEY, - arizona_lvl_to_key[i].report); - info->input->name = "Headset"; - info->input->phys = "arizona/extcon"; - info->input->dev.parent = &pdev->dev; - pm_runtime_enable(&pdev->dev); pm_runtime_idle(&pdev->dev); pm_runtime_get_sync(&pdev->dev); diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 455c51d22d6b..eb11a8ac6db2 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -86,6 +86,11 @@ struct arizona_micd_config { bool gpio; }; +struct arizona_micd_range { + int max; /** Ohms */ + int key; /** Key to report to input layer */ +}; + struct arizona_pdata { int reset; /** GPIO controlling /RESET, if any */ int ldoena; /** GPIO controlling LODENA, if any */ @@ -138,6 +143,10 @@ struct arizona_pdata { /** Force MICBIAS on for mic detect */ bool micd_force_micbias; + /** Mic detect level parameters */ + const struct arizona_micd_range *micd_ranges; + int num_micd_ranges; + /** Headset polarity configurations */ struct arizona_micd_config *micd_configs; int num_micd_configs; -- cgit From e56a0a572be150c79cdbf62ff98f4a63419e1c0b Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 1 Apr 2013 19:03:52 +0100 Subject: extcon: arizona: Allow pull to be disabled on GPIO5 when used for jackdet In some designs an external pull won't be needed.
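A hypothetical machine-file fragment shows how the new fields fit together, combining the micd_ranges added above with the jd_gpio5_nopull flag this patch introduces (all threshold and key values are examples only):

#include <linux/input.h>
#include <linux/kernel.h>
#include <linux/mfd/arizona/pdata.h>

/* Thresholds must be sorted and should sit on supported MICD levels. */
static const struct arizona_micd_range board_micd_ranges[] = {
	{ .max = 11,  .key = KEY_MEDIA },
	{ .max = 186, .key = KEY_VOLUMEUP },
	{ .max = 430, .key = KEY_VOLUMEDOWN },
};

static struct arizona_pdata board_arizona_pdata = {
	.jd_gpio5 = true,
	/* the board provides an external pull, so skip the internal one */
	.jd_gpio5_nopull = true,
	.micd_ranges = board_micd_ranges,
	.num_micd_ranges = ARRAY_SIZE(board_micd_ranges),
};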
Signed-off-by: Mark Brown --- drivers/extcon/extcon-arizona.c | 9 +++++++-- include/linux/mfd/arizona/pdata.h | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c index 95748d3cbc4e..132bc99fdc06 100644 --- a/drivers/extcon/extcon-arizona.c +++ b/drivers/extcon/extcon-arizona.c @@ -986,6 +986,7 @@ static int arizona_extcon_probe(struct platform_device *pdev) struct arizona *arizona = dev_get_drvdata(pdev->dev.parent); struct arizona_pdata *pdata; struct arizona_extcon_info *info; + unsigned int val; int jack_irq_fall, jack_irq_rise; int ret, mode, i, j; @@ -1172,9 +1173,13 @@ static int arizona_extcon_probe(struct platform_device *pdev) */ if (info->micd_clamp) { if (arizona->pdata.jd_gpio5) { - /* Put the GPIO into input mode */ + /* Put the GPIO into input mode with optional pull */ + val = 0xc101; + if (arizona->pdata.jd_gpio5_nopull) + val &= ~ARIZONA_GPN_PU; + regmap_write(arizona->regmap, ARIZONA_GPIO5_CTRL, - 0xc101); + val); regmap_update_bits(arizona->regmap, ARIZONA_MICD_CLAMP_CONTROL, diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index eb11a8ac6db2..008b8c40549f 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -122,6 +122,9 @@ struct arizona_pdata { /** GPIO5 is used for jack detection */ bool jd_gpio5; + /** Internal pull on GPIO5 is disabled when used for jack detection */ + bool jd_gpio5_nopull; + /** Use the headphone detect circuit to identify the accessory */ bool hpdet_acc_id; -- cgit From 9c2ba270eaa227c999af451e1c2c9bf0d24aa8e5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 25 Feb 2013 23:42:31 +0000 Subject: extcon: arizona: Simplify HPDET based identification Rather than measuring both HP channels we can simply directly measure the microphone impedance and then rely on MICDET for final confirmation of the presence of a suitable microphone. This improves the overall performance of the identification process. Signed-off-by: Mark Brown --- drivers/extcon/extcon-arizona.c | 46 +++++++++++++++++++++------------------ include/linux/mfd/arizona/pdata.h | 3 +++ 2 files changed, 28 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c index 7c4ce812d735..a83ca27aa99f 100644 --- a/drivers/extcon/extcon-arizona.c +++ b/drivers/extcon/extcon-arizona.c @@ -459,7 +459,8 @@ static int arizona_hpdet_read(struct arizona_extcon_info *info) return val; } -static int arizona_hpdet_do_id(struct arizona_extcon_info *info, int *reading) +static int arizona_hpdet_do_id(struct arizona_extcon_info *info, int *reading, + bool *mic) { struct arizona *arizona = info->arizona; int id_gpio = arizona->pdata.hpdet_id_gpio; @@ -470,11 +471,9 @@ static int arizona_hpdet_do_id(struct arizona_extcon_info *info, int *reading) */ if (arizona->pdata.hpdet_acc_id) { info->hpdet_res[info->num_hpdet_res++] = *reading; - info->hpdet_res[info->num_hpdet_res++] = *reading; /* Only check the mic directly if we didn't already ID it */ - if (id_gpio && info->num_hpdet_res == 2 && - !((info->hpdet_res[0] > info->hpdet_res[1] * 2))) { + if (id_gpio && info->num_hpdet_res == 1) { dev_dbg(arizona->dev, "Measuring mic\n"); regmap_update_bits(arizona->regmap, @@ -493,10 +492,8 @@ static int arizona_hpdet_do_id(struct arizona_extcon_info *info, int *reading) } /* OK, got both. Now, compare... 
*/ - dev_dbg(arizona->dev, "HPDET measured %d %d %d\n", - info->hpdet_res[0], info->hpdet_res[1], - info->hpdet_res[2]); - + dev_dbg(arizona->dev, "HPDET measured %d %d\n", + info->hpdet_res[0], info->hpdet_res[1]); /* Take the headphone impedance for the main report */ *reading = info->hpdet_res[0]; @@ -512,13 +509,11 @@ static int arizona_hpdet_do_id(struct arizona_extcon_info *info, int *reading) } /* - * Either the two grounds measure differently or we - * measure the mic as high impedance. + * If we measure the mic as high impedance */ - if ((info->hpdet_res[0] > info->hpdet_res[1] * 2) || - (id_gpio && info->hpdet_res[2] > 1257)) { + if (!id_gpio || info->hpdet_res[1] > 50) { dev_dbg(arizona->dev, "Detected mic\n"); - info->mic = true; + *mic = true; info->detecting = true; } else { dev_dbg(arizona->dev, "Detected headphone\n"); @@ -541,6 +536,7 @@ static irqreturn_t arizona_hpdet_irq(int irq, void *data) int id_gpio = arizona->pdata.hpdet_id_gpio; int report = ARIZONA_CABLE_HEADPHONE; int ret, reading; + bool mic = false; mutex_lock(&info->lock); @@ -576,7 +572,7 @@ ARIZONA_HP_IMPEDANCE_RANGE_MASK | ARIZONA_HP_POLL, 0); - ret = arizona_hpdet_do_id(info, &reading); + ret = arizona_hpdet_do_id(info, &reading, &mic); if (ret == -EAGAIN) { goto out; } else if (ret < 0) { @@ -606,7 +602,7 @@ done: ARIZONA_ACCDET_MODE_MASK, ARIZONA_ACCDET_MODE_MIC); /* If we have a mic then reenable MICDET */ - if (info->mic) + if (mic || info->mic) arizona_start_mic(info); if (info->hpdet_active) { @@ -681,6 +677,8 @@ err: static void arizona_start_hpdet_acc_id(struct arizona_extcon_info *info) { struct arizona *arizona = info->arizona; + int hp_reading = 32; + bool mic; int ret; dev_dbg(arizona->dev, "Starting identification via HPDET\n"); @@ -702,12 +700,18 @@ static void arizona_start_hpdet_acc_id(struct arizona_extcon_info *info) goto err; } - ret = regmap_update_bits(arizona->regmap, ARIZONA_HEADPHONE_DETECT_1, - ARIZONA_HP_POLL, ARIZONA_HP_POLL); - if (ret != 0) { - dev_err(arizona->dev, "Can't start HPDETL measurement: %d\n", - ret); - goto err; + if (arizona->pdata.hpdet_acc_id_line) { + ret = regmap_update_bits(arizona->regmap, + ARIZONA_HEADPHONE_DETECT_1, + ARIZONA_HP_POLL, ARIZONA_HP_POLL); + if (ret != 0) { + dev_err(arizona->dev, + "Can't start HPDETL measurement: %d\n", + ret); + goto err; + } + } else { + arizona_hpdet_do_id(info, &hp_reading, &mic); } return; diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 008b8c40549f..45c84777c624 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -128,6 +128,9 @@ struct arizona_pdata { /** Use the headphone detect circuit to identify the accessory */ bool hpdet_acc_id; + /** Check for line output with HPDET method */ + bool hpdet_acc_id_line; + /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; -- cgit From cd59e79656f4e7137909166248a935d422b1245a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 1 Apr 2013 19:21:48 +0100 Subject: extcon: arizona: Allow additional debounce during microphone detection Help mitigate mechanical bounce during the initial detection by allowing an additional debounce to be configured on top of the debounce the hardware itself applies during the initial phase of the microphone detection operation.
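The mechanism is the usual deferred-work debounce: the threaded IRQ handler reschedules a delayed work item rather than acting at once. A generic sketch of that pattern, with illustrative "demo" names rather than the driver's own:

#include <linux/interrupt.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>

struct demo_detect {
	struct delayed_work detect_work;
	int debounce_ms;		/* 0 = handle immediately */
};

static void demo_detect_work(struct work_struct *work)
{
	struct demo_detect *det = container_of(work, struct demo_detect,
					       detect_work.work);

	/* the signal has settled; read the hardware and report the state */
	pr_debug("debounced detect, %d ms after the IRQ\n", det->debounce_ms);
}

/* runs as a threaded IRQ handler, so the _sync cancel may sleep here */
static irqreturn_t demo_irq(int irq, void *data)
{
	struct demo_detect *det = data;

	cancel_delayed_work_sync(&det->detect_work);
	if (det->debounce_ms)
		schedule_delayed_work(&det->detect_work,
				      msecs_to_jiffies(det->debounce_ms));
	else
		demo_detect_work(&det->detect_work.work);

	return IRQ_HANDLED;
}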
Signed-off-by: Mark Brown --- drivers/extcon/extcon-arizona.c | 35 ++++++++++++++++++++++++++++++----- include/linux/mfd/arizona/pdata.h | 3 +++ 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c index 26f9a1ae15c4..c7f8eb4299d2 100644 --- a/drivers/extcon/extcon-arizona.c +++ b/drivers/extcon/extcon-arizona.c @@ -64,6 +64,7 @@ struct arizona_extcon_info { bool micd_clamp; struct delayed_work hpdet_work; + struct delayed_work micd_detect_work; struct delayed_work micd_timeout_work; bool hpdet_active; @@ -750,9 +751,11 @@ static void arizona_micd_timeout_work(struct work_struct *work) mutex_unlock(&info->lock); } -static irqreturn_t arizona_micdet(int irq, void *data) +static void arizona_micd_detect(struct work_struct *work) { - struct arizona_extcon_info *info = data; + struct arizona_extcon_info *info = container_of(work, + struct arizona_extcon_info, + micd_detect_work.work); struct arizona *arizona = info->arizona; unsigned int val = 0, lvl; int ret, i, key; @@ -766,7 +769,7 @@ static irqreturn_t arizona_micdet(int irq, void *data) if (ret != 0) { dev_err(arizona->dev, "Failed to read MICDET: %d\n", ret); mutex_unlock(&info->lock); - return IRQ_NONE; + return; } dev_dbg(arizona->dev, "MICDET: %x\n", val); @@ -774,14 +777,14 @@ static irqreturn_t arizona_micdet(int irq, void *data) if (!(val & ARIZONA_MICD_VALID)) { dev_warn(arizona->dev, "Microphone detection state invalid\n"); mutex_unlock(&info->lock); - return IRQ_NONE; + return; } } if (i == 10 && !(val & 0x7fc)) { dev_err(arizona->dev, "Failed to get valid MICDET value\n"); mutex_unlock(&info->lock); - return IRQ_NONE; + return; } /* Due to jack detect this should never happen */ @@ -890,6 +893,27 @@ handled: pm_runtime_mark_last_busy(info->dev); mutex_unlock(&info->lock); +} + +static irqreturn_t arizona_micdet(int irq, void *data) +{ + struct arizona_extcon_info *info = data; + struct arizona *arizona = info->arizona; + int debounce = arizona->pdata.micd_detect_debounce; + + cancel_delayed_work_sync(&info->micd_detect_work); + cancel_delayed_work_sync(&info->micd_timeout_work); + + mutex_lock(&info->lock); + if (!info->detecting) + debounce = 0; + mutex_unlock(&info->lock); + + if (debounce) + schedule_delayed_work(&info->micd_detect_work, + msecs_to_jiffies(debounce)); + else + arizona_micd_detect(&info->micd_detect_work.work); return IRQ_HANDLED; } @@ -1072,6 +1096,7 @@ static int arizona_extcon_probe(struct platform_device *pdev) info->dev = &pdev->dev; info->last_jackdet = ~(ARIZONA_MICD_CLAMP_STS | ARIZONA_JD1_STS); INIT_DELAYED_WORK(&info->hpdet_work, arizona_hpdet_work); + INIT_DELAYED_WORK(&info->micd_detect_work, arizona_micd_detect); INIT_DELAYED_WORK(&info->micd_timeout_work, arizona_micd_timeout_work); platform_set_drvdata(pdev, info); diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 45c84777c624..3ef300baa2e6 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -134,6 +134,9 @@ struct arizona_pdata { /** GPIO used for mic isolation with HPDET */ int hpdet_id_gpio; + /** Extra debounce timeout used during initial mic detection (ms) */ + int micd_detect_debounce; + /** GPIO for mic detection polarity */ int micd_pol_gpio; -- cgit From 7abd4e2a8f1c3e534da44c35e2d3d6353573e51f Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 1 Apr 2013 19:25:55 +0100 Subject: extcon: arizona: Make mic detection timeout configurable Signed-off-by: Mark 
Brown --- drivers/extcon/extcon-arizona.c | 13 ++++++++++--- include/linux/mfd/arizona/pdata.h | 3 +++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/extcon/extcon-arizona.c b/drivers/extcon/extcon-arizona.c index c7f8eb4299d2..7a1b4a7791ba 100644 --- a/drivers/extcon/extcon-arizona.c +++ b/drivers/extcon/extcon-arizona.c @@ -42,7 +42,7 @@ #define ARIZONA_HPDET_MAX 10000 #define HPDET_DEBOUNCE 500 -#define MICD_TIMEOUT 2000 +#define DEFAULT_MICD_TIMEOUT 2000 struct arizona_extcon_info { struct device *dev; @@ -60,6 +60,8 @@ const struct arizona_micd_range *micd_ranges; int num_micd_ranges; + int micd_timeout; + bool micd_reva; bool micd_clamp; @@ -889,7 +891,7 @@ static void arizona_micd_detect(struct work_struct *work) handled: if (info->detecting) schedule_delayed_work(&info->micd_timeout_work, - msecs_to_jiffies(MICD_TIMEOUT)); + msecs_to_jiffies(info->micd_timeout)); pm_runtime_mark_last_busy(info->dev); mutex_unlock(&info->lock); @@ -970,7 +972,7 @@ static irqreturn_t arizona_jackdet(int irq, void *data) if (cancelled_mic) schedule_delayed_work(&info->micd_timeout_work, - msecs_to_jiffies(MICD_TIMEOUT)); + msecs_to_jiffies(info->micd_timeout)); goto out; } @@ -1027,6 +1029,11 @@ static irqreturn_t arizona_jackdet(int irq, void *data) ARIZONA_MICD_CLAMP_DB | ARIZONA_JD1_DB); } + if (arizona->pdata.micd_timeout) + info->micd_timeout = arizona->pdata.micd_timeout; + else + info->micd_timeout = DEFAULT_MICD_TIMEOUT; + /* Clear trig_sts to make sure DCVDD is not forced up */ regmap_write(arizona->regmap, ARIZONA_AOD_WKUP_AND_TRIG, ARIZONA_MICD_CLAMP_FALL_TRIG_STS | diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 3ef300baa2e6..a0f940987a3e 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -149,6 +149,9 @@ struct arizona_pdata { /** Mic detect debounce level */ int micd_dbtime; + /** Mic detect timeout (ms) */ + int micd_timeout; + /** Force MICBIAS on for mic detect */ bool micd_force_micbias; -- cgit From b43a7ffbf33be7e4d3b10b7714ee663ea2c52fe2 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Sun, 24 Mar 2013 11:56:43 +0530 Subject: cpufreq: Notify all policy->cpus in cpufreq_notify_transition() policy->cpus contains all online cpus that share a single clock line, and their frequencies are always updated together. Many SMP systems' cpufreq drivers take care of this individually, but the best place for this code is in the cpufreq core. This patch modifies cpufreq_notify_transition() to notify the frequency change for all cpus in policy->cpus, and hence updates all users of this API. Signed-off-by: Viresh Kumar Acked-by: Stephen Warren Tested-by: Stephen Warren Signed-off-by: Rafael J.
Wysocki --- arch/arm/mach-davinci/cpufreq.c | 5 +- arch/arm/mach-imx/cpufreq.c | 5 +- arch/arm/mach-integrator/cpu.c | 6 +-- arch/arm/mach-pxa/cpufreq-pxa2xx.c | 5 +- arch/arm/mach-pxa/cpufreq-pxa3xx.c | 5 +- arch/arm/mach-s3c24xx/cpufreq.c | 8 +-- arch/arm/mach-sa1100/cpu-sa1100.c | 5 +- arch/arm/mach-sa1100/cpu-sa1110.c | 5 +- arch/arm/mach-tegra/cpu-tegra.c | 15 +++--- arch/avr32/mach-at32ap/cpufreq.c | 5 +- arch/blackfin/mach-common/cpufreq.c | 79 ++++++++++++---------------- arch/cris/arch-v32/mach-a3/cpufreq.c | 20 +++---- arch/cris/arch-v32/mach-fs/cpufreq.c | 17 +++--- arch/ia64/kernel/cpufreq/acpi-cpufreq.c | 22 ++++---- arch/mips/kernel/cpufreq/loongson2_cpufreq.c | 5 +- arch/powerpc/platforms/cell/cbe_cpufreq.c | 5 +- arch/powerpc/platforms/pasemi/cpufreq.c | 5 +- arch/powerpc/platforms/powermac/cpufreq_32.c | 14 ++--- arch/powerpc/platforms/powermac/cpufreq_64.c | 5 +- arch/sh/kernel/cpufreq.c | 5 +- arch/sparc/kernel/us2e_cpufreq.c | 13 ++--- arch/sparc/kernel/us3_cpufreq.c | 13 ++--- arch/unicore32/kernel/cpu-ucv2.c | 5 +- drivers/cpufreq/acpi-cpufreq.c | 11 +--- drivers/cpufreq/cpufreq-cpu0.c | 12 ++--- drivers/cpufreq/cpufreq-nforce2.c | 5 +- drivers/cpufreq/cpufreq.c | 45 +++++++++------- drivers/cpufreq/dbx500-cpufreq.c | 6 +-- drivers/cpufreq/e_powersaver.c | 11 ++-- drivers/cpufreq/elanfreq.c | 10 ++-- drivers/cpufreq/exynos-cpufreq.c | 7 +-- drivers/cpufreq/gx-suspmod.c | 11 ++-- drivers/cpufreq/imx6q-cpufreq.c | 12 ++--- drivers/cpufreq/kirkwood-cpufreq.c | 10 ++-- drivers/cpufreq/longhaul.c | 18 ++++--- drivers/cpufreq/maple-cpufreq.c | 5 +- drivers/cpufreq/omap-cpufreq.c | 11 +--- drivers/cpufreq/p4-clockmod.c | 10 +--- drivers/cpufreq/pcc-cpufreq.c | 5 +- drivers/cpufreq/powernow-k6.c | 12 ++--- drivers/cpufreq/powernow-k7.c | 10 ++-- drivers/cpufreq/powernow-k8.c | 16 +++--- drivers/cpufreq/s3c2416-cpufreq.c | 5 +- drivers/cpufreq/s3c64xx-cpufreq.c | 7 ++- drivers/cpufreq/s5pv210-cpufreq.c | 5 +- drivers/cpufreq/sc520_freq.c | 10 ++-- drivers/cpufreq/spear-cpufreq.c | 7 +-- drivers/cpufreq/speedstep-centrino.c | 24 ++------- drivers/cpufreq/speedstep-ich.c | 12 +---- drivers/cpufreq/speedstep-smi.c | 5 +- include/linux/cpufreq.h | 4 +- 51 files changed, 238 insertions(+), 340 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-davinci/cpufreq.c b/arch/arm/mach-davinci/cpufreq.c index 4729eaab0f40..55eb8703043d 100644 --- a/arch/arm/mach-davinci/cpufreq.c +++ b/arch/arm/mach-davinci/cpufreq.c @@ -90,7 +90,6 @@ static int davinci_target(struct cpufreq_policy *policy, freqs.old = davinci_getspeed(0); freqs.new = clk_round_rate(armclk, target_freq * 1000) / 1000; - freqs.cpu = 0; if (freqs.old == freqs.new) return ret; @@ -102,7 +101,7 @@ static int davinci_target(struct cpufreq_policy *policy, if (ret) return -EINVAL; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* if moving to higher frequency, up the voltage beforehand */ if (pdata->set_voltage && freqs.new > freqs.old) { @@ -126,7 +125,7 @@ static int davinci_target(struct cpufreq_policy *policy, pdata->set_voltage(idx); out: - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return ret; } diff --git a/arch/arm/mach-imx/cpufreq.c b/arch/arm/mach-imx/cpufreq.c index d8c75c3c925d..cfce5e3f67f5 100644 --- a/arch/arm/mach-imx/cpufreq.c +++ b/arch/arm/mach-imx/cpufreq.c @@ -87,13 +87,12 @@ static int mxc_set_target(struct cpufreq_policy *policy, freqs.old = 
clk_get_rate(cpu_clk) / 1000; freqs.new = freq_Hz / 1000; - freqs.cpu = 0; freqs.flags = 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); ret = set_cpu_freq(freq_Hz); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return ret; } diff --git a/arch/arm/mach-integrator/cpu.c b/arch/arm/mach-integrator/cpu.c index 590c192cdf4d..df863c30771c 100644 --- a/arch/arm/mach-integrator/cpu.c +++ b/arch/arm/mach-integrator/cpu.c @@ -123,14 +123,12 @@ static int integrator_set_target(struct cpufreq_policy *policy, vco = icst_hz_to_vco(&cclk_params, target_freq * 1000); freqs.new = icst_hz(&cclk_params, vco) / 1000; - freqs.cpu = policy->cpu; - if (freqs.old == freqs.new) { set_cpus_allowed(current, cpus_allowed); return 0; } - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); cm_osc = __raw_readl(CM_OSC); @@ -151,7 +149,7 @@ static int integrator_set_target(struct cpufreq_policy *policy, */ set_cpus_allowed(current, cpus_allowed); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/arch/arm/mach-pxa/cpufreq-pxa2xx.c b/arch/arm/mach-pxa/cpufreq-pxa2xx.c index 6a7aeab42f6c..f1ca4daa1ad6 100644 --- a/arch/arm/mach-pxa/cpufreq-pxa2xx.c +++ b/arch/arm/mach-pxa/cpufreq-pxa2xx.c @@ -311,7 +311,6 @@ static int pxa_set_target(struct cpufreq_policy *policy, new_freq_mem = pxa_freq_settings[idx].membus; freqs.old = policy->cur; freqs.new = new_freq_cpu; - freqs.cpu = policy->cpu; if (freq_debug) pr_debug("Changing CPU frequency to %d Mhz, (SDRAM %d Mhz)\n", @@ -327,7 +326,7 @@ static int pxa_set_target(struct cpufreq_policy *policy, * you should add a notify client with any platform specific * Vcc changing capability */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* Calculate the next MDREFR. If we're slowing down the SDRAM clock * we need to preset the smaller DRI before the change. 
If we're @@ -382,7 +381,7 @@ static int pxa_set_target(struct cpufreq_policy *policy, * you should add a notify client with any platform specific * SDRAM refresh timer adjustments */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); /* * Even if voltage setting fails, we don't report it, as the frequency diff --git a/arch/arm/mach-pxa/cpufreq-pxa3xx.c b/arch/arm/mach-pxa/cpufreq-pxa3xx.c index b85b4ab7aac6..8c45b2b926a7 100644 --- a/arch/arm/mach-pxa/cpufreq-pxa3xx.c +++ b/arch/arm/mach-pxa/cpufreq-pxa3xx.c @@ -184,7 +184,6 @@ static int pxa3xx_cpufreq_set(struct cpufreq_policy *policy, freqs.old = policy->cur; freqs.new = next->cpufreq_mhz * 1000; - freqs.cpu = policy->cpu; pr_debug("CPU frequency from %d MHz to %d MHz%s\n", freqs.old / 1000, freqs.new / 1000, @@ -193,14 +192,14 @@ static int pxa3xx_cpufreq_set(struct cpufreq_policy *policy, if (freqs.old == target_freq) return 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); local_irq_save(flags); __update_core_freq(next); __update_bus_freq(next); local_irq_restore(flags); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/arch/arm/mach-s3c24xx/cpufreq.c b/arch/arm/mach-s3c24xx/cpufreq.c index 5f181e733eee..3c0e78ede0da 100644 --- a/arch/arm/mach-s3c24xx/cpufreq.c +++ b/arch/arm/mach-s3c24xx/cpufreq.c @@ -204,7 +204,6 @@ static int s3c_cpufreq_settarget(struct cpufreq_policy *policy, freqs.old = cpu_cur.freq; freqs.new = cpu_new.freq; - freqs.freqs.cpu = 0; freqs.freqs.old = cpu_cur.freq.armclk / 1000; freqs.freqs.new = cpu_new.freq.armclk / 1000; @@ -218,9 +217,7 @@ static int s3c_cpufreq_settarget(struct cpufreq_policy *policy, s3c_cpufreq_updateclk(clk_pclk, cpu_new.freq.pclk); /* start the frequency change */ - - if (policy) - cpufreq_notify_transition(&freqs.freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs.freqs, CPUFREQ_PRECHANGE); /* If hclk is staying the same, then we do not need to * re-write the IO or the refresh timings whilst we are changing @@ -264,8 +261,7 @@ static int s3c_cpufreq_settarget(struct cpufreq_policy *policy, local_irq_restore(flags); /* notify everyone we've done this */ - if (policy) - cpufreq_notify_transition(&freqs.freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs.freqs, CPUFREQ_POSTCHANGE); s3c_freq_dbg("%s: finished\n", __func__); return 0; diff --git a/arch/arm/mach-sa1100/cpu-sa1100.c b/arch/arm/mach-sa1100/cpu-sa1100.c index e8f4d1e19233..32687617c7a5 100644 --- a/arch/arm/mach-sa1100/cpu-sa1100.c +++ b/arch/arm/mach-sa1100/cpu-sa1100.c @@ -201,9 +201,8 @@ static int sa1100_target(struct cpufreq_policy *policy, freqs.old = cur; freqs.new = sa11x0_ppcr_to_freq(new_ppcr); - freqs.cpu = 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (freqs.new > cur) sa1100_update_dram_timings(cur, freqs.new); @@ -213,7 +212,7 @@ static int sa1100_target(struct cpufreq_policy *policy, if (freqs.new < cur) sa1100_update_dram_timings(cur, freqs.new); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/arch/arm/mach-sa1100/cpu-sa1110.c b/arch/arm/mach-sa1100/cpu-sa1110.c index 48c45b0c92bb..38a77330dc16 100644 --- a/arch/arm/mach-sa1100/cpu-sa1110.c +++ 
b/arch/arm/mach-sa1100/cpu-sa1110.c @@ -258,7 +258,6 @@ static int sa1110_target(struct cpufreq_policy *policy, freqs.old = sa11x0_getspeed(0); freqs.new = sa11x0_ppcr_to_freq(ppcr); - freqs.cpu = 0; sdram_calculate_timing(&sd, freqs.new, sdram); @@ -279,7 +278,7 @@ static int sa1110_target(struct cpufreq_policy *policy, sd.mdcas[2] = 0xaaaaaaaa; #endif - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* * The clock could be going away for some time. Set the SDRAMs @@ -327,7 +326,7 @@ static int sa1110_target(struct cpufreq_policy *policy, */ sdram_update_refresh(freqs.new, sdram); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/arch/arm/mach-tegra/cpu-tegra.c b/arch/arm/mach-tegra/cpu-tegra.c index e3d6e15ff188..11ca730970f8 100644 --- a/arch/arm/mach-tegra/cpu-tegra.c +++ b/arch/arm/mach-tegra/cpu-tegra.c @@ -106,7 +106,8 @@ out: return ret; } -static int tegra_update_cpu_speed(unsigned long rate) +static int tegra_update_cpu_speed(struct cpufreq_policy *policy, + unsigned long rate) { int ret = 0; struct cpufreq_freqs freqs; @@ -128,8 +129,7 @@ static int tegra_update_cpu_speed(unsigned long rate) else clk_set_rate(emc_clk, 100000000); /* emc 50Mhz */ - for_each_online_cpu(freqs.cpu) - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); #ifdef CONFIG_CPU_FREQ_DEBUG printk(KERN_DEBUG "cpufreq-tegra: transition: %u --> %u\n", @@ -143,8 +143,7 @@ static int tegra_update_cpu_speed(unsigned long rate) return ret; } - for_each_online_cpu(freqs.cpu) - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } @@ -181,7 +180,7 @@ static int tegra_target(struct cpufreq_policy *policy, target_cpu_speed[policy->cpu] = freq; - ret = tegra_update_cpu_speed(tegra_cpu_highest_speed()); + ret = tegra_update_cpu_speed(policy, tegra_cpu_highest_speed()); out: mutex_unlock(&tegra_cpu_lock); @@ -193,10 +192,12 @@ static int tegra_pm_notify(struct notifier_block *nb, unsigned long event, { mutex_lock(&tegra_cpu_lock); if (event == PM_SUSPEND_PREPARE) { + struct cpufreq_policy *policy = cpufreq_cpu_get(0); is_suspended = true; pr_info("Tegra cpufreq suspend: setting frequency to %d kHz\n", freq_table[0].frequency); - tegra_update_cpu_speed(freq_table[0].frequency); + tegra_update_cpu_speed(policy, freq_table[0].frequency); + cpufreq_cpu_put(policy); } else if (event == PM_POST_SUSPEND) { is_suspended = false; } diff --git a/arch/avr32/mach-at32ap/cpufreq.c b/arch/avr32/mach-at32ap/cpufreq.c index 18b765629a0c..654488723cb5 100644 --- a/arch/avr32/mach-at32ap/cpufreq.c +++ b/arch/avr32/mach-at32ap/cpufreq.c @@ -61,7 +61,6 @@ static int at32_set_target(struct cpufreq_policy *policy, freqs.old = at32_get_speed(0); freqs.new = (freq + 500) / 1000; - freqs.cpu = 0; freqs.flags = 0; if (!ref_freq) { @@ -69,7 +68,7 @@ static int at32_set_target(struct cpufreq_policy *policy, loops_per_jiffy_ref = boot_cpu_data.loops_per_jiffy; } - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (freqs.old < freqs.new) boot_cpu_data.loops_per_jiffy = cpufreq_scale( loops_per_jiffy_ref, ref_freq, freqs.new); @@ -77,7 +76,7 @@ static int at32_set_target(struct cpufreq_policy *policy, if (freqs.new < freqs.old) boot_cpu_data.loops_per_jiffy = cpufreq_scale( 
loops_per_jiffy_ref, ref_freq, freqs.new); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); pr_debug("cpufreq: set frequency %lu Hz\n", freq); diff --git a/arch/blackfin/mach-common/cpufreq.c b/arch/blackfin/mach-common/cpufreq.c index d88bd31319e6..995511e80bef 100644 --- a/arch/blackfin/mach-common/cpufreq.c +++ b/arch/blackfin/mach-common/cpufreq.c @@ -127,13 +127,13 @@ unsigned long cpu_set_cclk(int cpu, unsigned long new) } #endif -static int bfin_target(struct cpufreq_policy *poli, +static int bfin_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { #ifndef CONFIG_BF60x unsigned int plldiv; #endif - unsigned int index, cpu; + unsigned int index; unsigned long cclk_hz; struct cpufreq_freqs freqs; static unsigned long lpj_ref; @@ -144,59 +144,48 @@ static int bfin_target(struct cpufreq_policy *poli, cycles_t cycles; #endif - for_each_online_cpu(cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); + if (cpufreq_frequency_table_target(policy, bfin_freq_table, target_freq, + relation, &index)) + return -EINVAL; - if (!policy) - continue; + cclk_hz = bfin_freq_table[index].frequency; - if (cpufreq_frequency_table_target(policy, bfin_freq_table, - target_freq, relation, &index)) - return -EINVAL; + freqs.old = bfin_getfreq_khz(0); + freqs.new = cclk_hz; - cclk_hz = bfin_freq_table[index].frequency; + pr_debug("cpufreq: changing cclk to %lu; target = %u, oldfreq = %u\n", + cclk_hz, target_freq, freqs.old); - freqs.old = bfin_getfreq_khz(0); - freqs.new = cclk_hz; - freqs.cpu = cpu; - - pr_debug("cpufreq: changing cclk to %lu; target = %u, oldfreq = %u\n", - cclk_hz, target_freq, freqs.old); - - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - if (cpu == CPUFREQ_CPU) { + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); #ifndef CONFIG_BF60x - plldiv = (bfin_read_PLL_DIV() & SSEL) | - dpm_state_table[index].csel; - bfin_write_PLL_DIV(plldiv); + plldiv = (bfin_read_PLL_DIV() & SSEL) | dpm_state_table[index].csel; + bfin_write_PLL_DIV(plldiv); #else - ret = cpu_set_cclk(cpu, freqs.new * 1000); - if (ret != 0) { - WARN_ONCE(ret, "cpufreq set freq failed %d\n", ret); - break; - } + ret = cpu_set_cclk(policy->cpu, freqs.new * 1000); + if (ret != 0) { + WARN_ONCE(ret, "cpufreq set freq failed %d\n", ret); + return ret; + } #endif - on_each_cpu(bfin_adjust_core_timer, &index, 1); + on_each_cpu(bfin_adjust_core_timer, &index, 1); #if defined(CONFIG_CYCLES_CLOCKSOURCE) - cycles = get_cycles(); - SSYNC(); - cycles += 10; /* ~10 cycles we lose after get_cycles() */ - __bfin_cycles_off += - (cycles << __bfin_cycles_mod) - (cycles << index); - __bfin_cycles_mod = index; + cycles = get_cycles(); + SSYNC(); + cycles += 10; /* ~10 cycles we lose after get_cycles() */ + __bfin_cycles_off += (cycles << __bfin_cycles_mod) - (cycles << index); + __bfin_cycles_mod = index; #endif - if (!lpj_ref_freq) { - lpj_ref = loops_per_jiffy; - lpj_ref_freq = freqs.old; - } - if (freqs.new != freqs.old) { - loops_per_jiffy = cpufreq_scale(lpj_ref, - lpj_ref_freq, freqs.new); - } - } - /* TODO: just test case for cycles clock source, remove later */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + if (!lpj_ref_freq) { + lpj_ref = loops_per_jiffy; + lpj_ref_freq = freqs.old; } + if (freqs.new != freqs.old) { + loops_per_jiffy = cpufreq_scale(lpj_ref, + lpj_ref_freq, freqs.new); + } + + /* TODO: just test case for cycles clock source, remove later */ + 
cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); pr_debug("cpufreq: done\n"); return ret; diff --git a/arch/cris/arch-v32/mach-a3/cpufreq.c b/arch/cris/arch-v32/mach-a3/cpufreq.c index ee391ecb5bc9..ee142c490575 100644 --- a/arch/cris/arch-v32/mach-a3/cpufreq.c +++ b/arch/cris/arch-v32/mach-a3/cpufreq.c @@ -27,23 +27,17 @@ static unsigned int cris_freq_get_cpu_frequency(unsigned int cpu) return clk_ctrl.pll ? 200000 : 6000; } -static void cris_freq_set_cpu_state(unsigned int state) +static void cris_freq_set_cpu_state(struct cpufreq_policy *policy, + unsigned int state) { - int i = 0; struct cpufreq_freqs freqs; reg_clkgen_rw_clk_ctrl clk_ctrl; clk_ctrl = REG_RD(clkgen, regi_clkgen, rw_clk_ctrl); -#ifdef CONFIG_SMP - for_each_present_cpu(i) -#endif - { - freqs.old = cris_freq_get_cpu_frequency(i); - freqs.new = cris_freq_table[state].frequency; - freqs.cpu = i; - } + freqs.old = cris_freq_get_cpu_frequency(policy->cpu); + freqs.new = cris_freq_table[state].frequency; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); local_irq_disable(); @@ -57,7 +51,7 @@ static void cris_freq_set_cpu_state(unsigned int state) local_irq_enable(); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); }; static int cris_freq_verify(struct cpufreq_policy *policy) @@ -75,7 +69,7 @@ static int cris_freq_target(struct cpufreq_policy *policy, target_freq, relation, &newstate)) return -EINVAL; - cris_freq_set_cpu_state(newstate); + cris_freq_set_cpu_state(policy, newstate); return 0; } diff --git a/arch/cris/arch-v32/mach-fs/cpufreq.c b/arch/cris/arch-v32/mach-fs/cpufreq.c index d92cf70d1cbe..12952235d5db 100644 --- a/arch/cris/arch-v32/mach-fs/cpufreq.c +++ b/arch/cris/arch-v32/mach-fs/cpufreq.c @@ -27,20 +27,17 @@ static unsigned int cris_freq_get_cpu_frequency(unsigned int cpu) return clk_ctrl.pll ? 
200000 : 6000; } -static void cris_freq_set_cpu_state(unsigned int state) +static void cris_freq_set_cpu_state(struct cpufreq_policy *policy, + unsigned int state) { - int i; struct cpufreq_freqs freqs; reg_config_rw_clk_ctrl clk_ctrl; clk_ctrl = REG_RD(config, regi_config, rw_clk_ctrl); - for_each_possible_cpu(i) { - freqs.old = cris_freq_get_cpu_frequency(i); - freqs.new = cris_freq_table[state].frequency; - freqs.cpu = i; - } + freqs.old = cris_freq_get_cpu_frequency(policy->cpu); + freqs.new = cris_freq_table[state].frequency; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); local_irq_disable(); @@ -54,7 +51,7 @@ static void cris_freq_set_cpu_state(unsigned int state) local_irq_enable(); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); }; static int cris_freq_verify(struct cpufreq_policy *policy) @@ -71,7 +68,7 @@ static int cris_freq_target(struct cpufreq_policy *policy, (policy, cris_freq_table, target_freq, relation, &newstate)) return -EINVAL; - cris_freq_set_cpu_state(newstate); + cris_freq_set_cpu_state(policy, newstate); return 0; } diff --git a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c index f09b174244d5..4700fef8d1fa 100644 --- a/arch/ia64/kernel/cpufreq/acpi-cpufreq.c +++ b/arch/ia64/kernel/cpufreq/acpi-cpufreq.c @@ -137,7 +137,7 @@ migrate_end: static int processor_set_freq ( struct cpufreq_acpi_io *data, - unsigned int cpu, + struct cpufreq_policy *policy, int state) { int ret = 0; @@ -149,8 +149,8 @@ processor_set_freq ( pr_debug("processor_set_freq\n"); saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpumask_of(cpu)); - if (smp_processor_id() != cpu) { + set_cpus_allowed_ptr(current, cpumask_of(policy->cpu)); + if (smp_processor_id() != policy->cpu) { retval = -EAGAIN; goto migrate_end; } @@ -170,12 +170,11 @@ processor_set_freq ( data->acpi_data.state, state); /* cpufreq frequency struct */ - cpufreq_freqs.cpu = cpu; cpufreq_freqs.old = data->freq_table[data->acpi_data.state].frequency; cpufreq_freqs.new = data->freq_table[state].frequency; /* notify cpufreq */ - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &cpufreq_freqs, CPUFREQ_PRECHANGE); /* * First we write the target state's 'control' value to the @@ -189,17 +188,20 @@ processor_set_freq ( ret = processor_set_pstate(value); if (ret) { unsigned int tmp = cpufreq_freqs.new; - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &cpufreq_freqs, + CPUFREQ_POSTCHANGE); cpufreq_freqs.new = cpufreq_freqs.old; cpufreq_freqs.old = tmp; - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &cpufreq_freqs, + CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &cpufreq_freqs, + CPUFREQ_POSTCHANGE); printk(KERN_WARNING "Transition failed with error %d\n", ret); retval = -ENODEV; goto migrate_end; } - cpufreq_notify_transition(&cpufreq_freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &cpufreq_freqs, CPUFREQ_POSTCHANGE); data->acpi_data.state = state; @@ -240,7 +242,7 @@ acpi_cpufreq_target ( if (result) return (result); - result = processor_set_freq(data, policy->cpu, next_state); + result = processor_set_freq(data, policy, next_state); return (result); } diff --git 
a/arch/mips/kernel/cpufreq/loongson2_cpufreq.c b/arch/mips/kernel/cpufreq/loongson2_cpufreq.c index 3237c5235f9c..bafda7063f03 100644 --- a/arch/mips/kernel/cpufreq/loongson2_cpufreq.c +++ b/arch/mips/kernel/cpufreq/loongson2_cpufreq.c @@ -80,7 +80,6 @@ static int loongson2_cpufreq_target(struct cpufreq_policy *policy, pr_debug("cpufreq: requested frequency %u Hz\n", target_freq * 1000); - freqs.cpu = cpu; freqs.old = loongson2_cpufreq_get(cpu); freqs.new = freq; freqs.flags = 0; @@ -89,7 +88,7 @@ static int loongson2_cpufreq_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); set_cpus_allowed_ptr(current, &cpus_allowed); @@ -97,7 +96,7 @@ static int loongson2_cpufreq_target(struct cpufreq_policy *policy, clk_set_rate(cpuclk, freq); /* notifiers */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); pr_debug("cpufreq: set frequency %u kHz\n", freq); diff --git a/arch/powerpc/platforms/cell/cbe_cpufreq.c b/arch/powerpc/platforms/cell/cbe_cpufreq.c index d4c39e32f147..718c6a33023d 100644 --- a/arch/powerpc/platforms/cell/cbe_cpufreq.c +++ b/arch/powerpc/platforms/cell/cbe_cpufreq.c @@ -156,10 +156,9 @@ static int cbe_cpufreq_target(struct cpufreq_policy *policy, freqs.old = policy->cur; freqs.new = cbe_freqs[cbe_pmode_new].frequency; - freqs.cpu = policy->cpu; mutex_lock(&cbe_switch_mutex); - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); pr_debug("setting frequency for cpu %d to %d kHz, " \ "1/%d of max frequency\n", @@ -169,7 +168,7 @@ static int cbe_cpufreq_target(struct cpufreq_policy *policy, rc = set_pmode(policy->cpu, cbe_pmode_new); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); mutex_unlock(&cbe_switch_mutex); return rc; diff --git a/arch/powerpc/platforms/pasemi/cpufreq.c b/arch/powerpc/platforms/pasemi/cpufreq.c index 890f30e70f98..be1e7958909e 100644 --- a/arch/powerpc/platforms/pasemi/cpufreq.c +++ b/arch/powerpc/platforms/pasemi/cpufreq.c @@ -273,10 +273,9 @@ static int pas_cpufreq_target(struct cpufreq_policy *policy, freqs.old = policy->cur; freqs.new = pas_freqs[pas_astate_new].frequency; - freqs.cpu = policy->cpu; mutex_lock(&pas_switch_mutex); - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); pr_debug("setting frequency for cpu %d to %d kHz, 1/%d of max frequency\n", policy->cpu, @@ -288,7 +287,7 @@ static int pas_cpufreq_target(struct cpufreq_policy *policy, for_each_online_cpu(i) set_astate(i, pas_astate_new); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); mutex_unlock(&pas_switch_mutex); ppc_proc_freq = freqs.new * 1000ul; diff --git a/arch/powerpc/platforms/powermac/cpufreq_32.c b/arch/powerpc/platforms/powermac/cpufreq_32.c index 311b804353b1..3104fad82480 100644 --- a/arch/powerpc/platforms/powermac/cpufreq_32.c +++ b/arch/powerpc/platforms/powermac/cpufreq_32.c @@ -335,7 +335,8 @@ static int pmu_set_cpu_speed(int low_speed) return 0; } -static int do_set_cpu_speed(int speed_mode, int notify) +static int do_set_cpu_speed(struct cpufreq_policy *policy, int speed_mode, + int notify) { struct cpufreq_freqs freqs; unsigned long l3cr; @@ -343,13 +344,12 @@ static int do_set_cpu_speed(int 
speed_mode, int notify) freqs.old = cur_freq; freqs.new = (speed_mode == CPUFREQ_HIGH) ? hi_freq : low_freq; - freqs.cpu = smp_processor_id(); if (freqs.old == freqs.new) return 0; if (notify) - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (speed_mode == CPUFREQ_LOW && cpu_has_feature(CPU_FTR_L3CR)) { l3cr = _get_L3CR(); @@ -366,7 +366,7 @@ static int do_set_cpu_speed(int speed_mode, int notify) _set_L3CR(prev_l3cr); } if (notify) - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); cur_freq = (speed_mode == CPUFREQ_HIGH) ? hi_freq : low_freq; return 0; @@ -393,7 +393,7 @@ static int pmac_cpufreq_target( struct cpufreq_policy *policy, target_freq, relation, &newstate)) return -EINVAL; - rc = do_set_cpu_speed(newstate, 1); + rc = do_set_cpu_speed(policy, newstate, 1); ppc_proc_freq = cur_freq * 1000ul; return rc; @@ -442,7 +442,7 @@ static int pmac_cpufreq_suspend(struct cpufreq_policy *policy) no_schedule = 1; sleep_freq = cur_freq; if (cur_freq == low_freq && !is_pmu_based) - do_set_cpu_speed(CPUFREQ_HIGH, 0); + do_set_cpu_speed(policy, CPUFREQ_HIGH, 0); return 0; } @@ -458,7 +458,7 @@ static int pmac_cpufreq_resume(struct cpufreq_policy *policy) * is that we force a switch to whatever it was, which is * probably high speed due to our suspend() routine */ - do_set_cpu_speed(sleep_freq == low_freq ? + do_set_cpu_speed(policy, sleep_freq == low_freq ? CPUFREQ_LOW : CPUFREQ_HIGH, 0); ppc_proc_freq = cur_freq * 1000ul; diff --git a/arch/powerpc/platforms/powermac/cpufreq_64.c b/arch/powerpc/platforms/powermac/cpufreq_64.c index 9650c6029c82..7ba423431cfe 100644 --- a/arch/powerpc/platforms/powermac/cpufreq_64.c +++ b/arch/powerpc/platforms/powermac/cpufreq_64.c @@ -339,11 +339,10 @@ static int g5_cpufreq_target(struct cpufreq_policy *policy, freqs.old = g5_cpu_freqs[g5_pmode_cur].frequency; freqs.new = g5_cpu_freqs[newstate].frequency; - freqs.cpu = 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); rc = g5_switch_freq(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); mutex_unlock(&g5_switch_mutex); diff --git a/arch/sh/kernel/cpufreq.c b/arch/sh/kernel/cpufreq.c index e68b45b6f3f9..2c7bd94f95ee 100644 --- a/arch/sh/kernel/cpufreq.c +++ b/arch/sh/kernel/cpufreq.c @@ -69,15 +69,14 @@ static int sh_cpufreq_target(struct cpufreq_policy *policy, dev_dbg(dev, "requested frequency %u Hz\n", target_freq * 1000); - freqs.cpu = cpu; freqs.old = sh_cpufreq_get(cpu); freqs.new = (freq + 500) / 1000; freqs.flags = 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); set_cpus_allowed_ptr(current, &cpus_allowed); clk_set_rate(cpuclk, freq); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); dev_dbg(dev, "set frequency %lu Hz\n", freq); diff --git a/arch/sparc/kernel/us2e_cpufreq.c b/arch/sparc/kernel/us2e_cpufreq.c index 489fc15f3194..abe963d7b87c 100644 --- a/arch/sparc/kernel/us2e_cpufreq.c +++ b/arch/sparc/kernel/us2e_cpufreq.c @@ -248,8 +248,10 @@ static unsigned int us2e_freq_get(unsigned int cpu) return clock_tick / estar_to_divisor(estar); } -static void us2e_set_cpu_divider_index(unsigned int cpu, unsigned int index) +static void us2e_set_cpu_divider_index(struct 
cpufreq_policy *policy, + unsigned int index) { + unsigned int cpu = policy->cpu; unsigned long new_bits, new_freq; unsigned long clock_tick, divisor, old_divisor, estar; cpumask_t cpus_allowed; @@ -272,14 +274,13 @@ static void us2e_set_cpu_divider_index(unsigned int cpu, unsigned int index) freqs.old = clock_tick / old_divisor; freqs.new = new_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (old_divisor != divisor) us2e_transition(estar, new_bits, clock_tick * 1000, old_divisor, divisor); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); set_cpus_allowed_ptr(current, &cpus_allowed); } @@ -295,7 +296,7 @@ static int us2e_freq_target(struct cpufreq_policy *policy, target_freq, relation, &new_index)) return -EINVAL; - us2e_set_cpu_divider_index(policy->cpu, new_index); + us2e_set_cpu_divider_index(policy, new_index); return 0; } @@ -335,7 +336,7 @@ static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy) static int us2e_freq_cpu_exit(struct cpufreq_policy *policy) { if (cpufreq_us2e_driver) - us2e_set_cpu_divider_index(policy->cpu, 0); + us2e_set_cpu_divider_index(policy, 0); return 0; } diff --git a/arch/sparc/kernel/us3_cpufreq.c b/arch/sparc/kernel/us3_cpufreq.c index eb1624b931d9..7ceb9c8458f0 100644 --- a/arch/sparc/kernel/us3_cpufreq.c +++ b/arch/sparc/kernel/us3_cpufreq.c @@ -96,8 +96,10 @@ static unsigned int us3_freq_get(unsigned int cpu) return ret; } -static void us3_set_cpu_divider_index(unsigned int cpu, unsigned int index) +static void us3_set_cpu_divider_index(struct cpufreq_policy *policy, + unsigned int index) { + unsigned int cpu = policy->cpu; unsigned long new_bits, new_freq, reg; cpumask_t cpus_allowed; struct cpufreq_freqs freqs; @@ -131,14 +133,13 @@ static void us3_set_cpu_divider_index(unsigned int cpu, unsigned int index) freqs.old = get_current_freq(cpu, reg); freqs.new = new_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); reg &= ~SAFARI_CFG_DIV_MASK; reg |= new_bits; write_safari_cfg(reg); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); set_cpus_allowed_ptr(current, &cpus_allowed); } @@ -156,7 +157,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, &new_index)) return -EINVAL; - us3_set_cpu_divider_index(policy->cpu, new_index); + us3_set_cpu_divider_index(policy, new_index); return 0; } @@ -192,7 +193,7 @@ static int __init us3_freq_cpu_init(struct cpufreq_policy *policy) static int us3_freq_cpu_exit(struct cpufreq_policy *policy) { if (cpufreq_us3_driver) - us3_set_cpu_divider_index(policy->cpu, 0); + us3_set_cpu_divider_index(policy, 0); return 0; } diff --git a/arch/unicore32/kernel/cpu-ucv2.c b/arch/unicore32/kernel/cpu-ucv2.c index 4a99f62584c7..ba5a71ce2d71 100644 --- a/arch/unicore32/kernel/cpu-ucv2.c +++ b/arch/unicore32/kernel/cpu-ucv2.c @@ -52,15 +52,14 @@ static int ucv2_target(struct cpufreq_policy *policy, struct cpufreq_freqs freqs; struct clk *mclk = clk_get(NULL, "MAIN_CLK"); - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (!clk_set_rate(mclk, target_freq * 1000)) { freqs.old = cur; freqs.new = target_freq; - freqs.cpu = 0; } - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + 
cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 57a8774f0b4e..11b8b4b54ceb 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -423,7 +423,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, struct drv_cmd cmd; unsigned int next_state = 0; /* Index into freq_table */ unsigned int next_perf_state = 0; /* Index into perf table */ - unsigned int i; int result = 0; pr_debug("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -486,10 +485,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); drv_write(&cmd); @@ -502,10 +498,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); perf->state = next_perf_state; out: diff --git a/drivers/cpufreq/cpufreq-cpu0.c b/drivers/cpufreq/cpufreq-cpu0.c index a7e51bd20502..65618536abfa 100644 --- a/drivers/cpufreq/cpufreq-cpu0.c +++ b/drivers/cpufreq/cpufreq-cpu0.c @@ -46,7 +46,7 @@ static int cpu0_set_target(struct cpufreq_policy *policy, struct opp *opp; unsigned long volt = 0, volt_old = 0, tol = 0; long freq_Hz; - unsigned int index, cpu; + unsigned int index; int ret; ret = cpufreq_frequency_table_target(policy, freq_table, target_freq, @@ -66,10 +66,7 @@ static int cpu0_set_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - for_each_online_cpu(cpu) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (cpu_reg) { rcu_read_lock(); @@ -121,10 +118,7 @@ static int cpu0_set_target(struct cpufreq_policy *policy, } post_notify: - for_each_online_cpu(cpu) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return ret; } diff --git a/drivers/cpufreq/cpufreq-nforce2.c b/drivers/cpufreq/cpufreq-nforce2.c index 13d311ee08b3..224a4787ecc2 100644 --- a/drivers/cpufreq/cpufreq-nforce2.c +++ b/drivers/cpufreq/cpufreq-nforce2.c @@ -263,7 +263,6 @@ static int nforce2_target(struct cpufreq_policy *policy, freqs.old = nforce2_get(policy->cpu); freqs.new = target_fsb * fid * 100; - freqs.cpu = 0; /* Only one CPU on nForce2 platforms */ if (freqs.old == freqs.new) return 0; @@ -271,7 +270,7 @@ static int nforce2_target(struct cpufreq_policy *policy, pr_debug("Old CPU frequency %d kHz, new %d kHz\n", freqs.old, freqs.new); - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* Disable IRQs */ /* local_irq_save(flags); */ @@ -286,7 +285,7 @@ static int nforce2_target(struct cpufreq_policy *policy, /* Enable IRQs */ /* local_irq_restore(flags); */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 85963fc48a5f..0198cd0a60ce 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ 
-249,19 +249,9 @@ static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) #endif -/** - * cpufreq_notify_transition - call notifier chain and adjust_jiffies - * on frequency transition. - * - * This function calls the transition notifiers and the "adjust_jiffies" - * function. It is called twice on all CPU frequency changes that have - * external effects. - */ -void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) +void __cpufreq_notify_transition(struct cpufreq_policy *policy, + struct cpufreq_freqs *freqs, unsigned int state) { - struct cpufreq_policy *policy; - unsigned long flags; - BUG_ON(irqs_disabled()); if (cpufreq_disabled()) @@ -271,10 +261,6 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) pr_debug("notification %u of frequency transition to %u kHz\n", state, freqs->new); - read_lock_irqsave(&cpufreq_driver_lock, flags); - policy = per_cpu(cpufreq_cpu_data, freqs->cpu); - read_unlock_irqrestore(&cpufreq_driver_lock, flags); - switch (state) { case CPUFREQ_PRECHANGE: @@ -308,6 +294,20 @@ void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) break; } } +/** + * cpufreq_notify_transition - call notifier chain and adjust_jiffies + * on frequency transition. + * + * This function calls the transition notifiers and the "adjust_jiffies" + * function. It is called twice on all CPU frequency changes that have + * external effects. + */ +void cpufreq_notify_transition(struct cpufreq_policy *policy, + struct cpufreq_freqs *freqs, unsigned int state) +{ + for_each_cpu(freqs->cpu, policy->cpus) + __cpufreq_notify_transition(policy, freqs, state); +} EXPORT_SYMBOL_GPL(cpufreq_notify_transition); @@ -1141,16 +1141,23 @@ static void handle_update(struct work_struct *work) static void cpufreq_out_of_sync(unsigned int cpu, unsigned int old_freq, unsigned int new_freq) { + struct cpufreq_policy *policy; struct cpufreq_freqs freqs; + unsigned long flags; + pr_debug("Warning: CPU frequency out of sync: cpufreq and timing " "core thinks of %u, is %u kHz.\n", old_freq, new_freq); - freqs.cpu = cpu; freqs.old = old_freq; freqs.new = new_freq; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + + read_lock_irqsave(&cpufreq_driver_lock, flags); + policy = per_cpu(cpufreq_cpu_data, cpu); + read_unlock_irqrestore(&cpufreq_driver_lock, flags); + + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); } diff --git a/drivers/cpufreq/dbx500-cpufreq.c b/drivers/cpufreq/dbx500-cpufreq.c index 72f0c3efa76e..7192a6df94c0 100644 --- a/drivers/cpufreq/dbx500-cpufreq.c +++ b/drivers/cpufreq/dbx500-cpufreq.c @@ -55,8 +55,7 @@ static int dbx500_cpufreq_target(struct cpufreq_policy *policy, return 0; /* pre-change notification */ - for_each_cpu(freqs.cpu, policy->cpus) - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* update armss clk frequency */ ret = clk_set_rate(armss_clk, freqs.new * 1000); @@ -68,8 +67,7 @@ static int dbx500_cpufreq_target(struct cpufreq_policy *policy, } /* post change notification */ - for_each_cpu(freqs.cpu, policy->cpus) - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/drivers/cpufreq/e_powersaver.c b/drivers/cpufreq/e_powersaver.c index 3fffbe6025cd..37380fb92621 100644 --- 
a/drivers/cpufreq/e_powersaver.c +++ b/drivers/cpufreq/e_powersaver.c @@ -104,7 +104,7 @@ static unsigned int eps_get(unsigned int cpu) } static int eps_set_state(struct eps_cpu_data *centaur, - unsigned int cpu, + struct cpufreq_policy *policy, u32 dest_state) { struct cpufreq_freqs freqs; @@ -112,10 +112,9 @@ static int eps_set_state(struct eps_cpu_data *centaur, int err = 0; int i; - freqs.old = eps_get(cpu); + freqs.old = eps_get(policy->cpu); freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff); - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* Wait while CPU is busy */ rdmsr(MSR_IA32_PERF_STATUS, lo, hi); @@ -162,7 +161,7 @@ postchange: current_multiplier); } #endif - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return err; } @@ -190,7 +189,7 @@ static int eps_target(struct cpufreq_policy *policy, /* Make frequency transition */ dest_state = centaur->freq_table[newstate].index & 0xffff; - ret = eps_set_state(centaur, cpu, dest_state); + ret = eps_set_state(centaur, policy, dest_state); if (ret) printk(KERN_ERR "eps: Timeout!\n"); return ret; diff --git a/drivers/cpufreq/elanfreq.c b/drivers/cpufreq/elanfreq.c index 960671fd3d7e..658d860344b0 100644 --- a/drivers/cpufreq/elanfreq.c +++ b/drivers/cpufreq/elanfreq.c @@ -117,15 +117,15 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) * There is no return value. */ -static void elanfreq_set_cpu_state(unsigned int state) +static void elanfreq_set_cpu_state(struct cpufreq_policy *policy, + unsigned int state) { struct cpufreq_freqs freqs; freqs.old = elanfreq_get_cpu_frequency(0); freqs.new = elan_multiplier[state].clock; - freqs.cpu = 0; /* elanfreq.c is UP only driver */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n", elan_multiplier[state].clock); @@ -161,7 +161,7 @@ static void elanfreq_set_cpu_state(unsigned int state) udelay(10000); local_irq_enable(); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); }; @@ -188,7 +188,7 @@ static int elanfreq_target(struct cpufreq_policy *policy, target_freq, relation, &newstate)) return -EINVAL; - elanfreq_set_cpu_state(newstate); + elanfreq_set_cpu_state(policy, newstate); return 0; } diff --git a/drivers/cpufreq/exynos-cpufreq.c b/drivers/cpufreq/exynos-cpufreq.c index 78057a357ddb..c0c4ce53b9e9 100644 --- a/drivers/cpufreq/exynos-cpufreq.c +++ b/drivers/cpufreq/exynos-cpufreq.c @@ -70,7 +70,6 @@ static int exynos_cpufreq_scale(unsigned int target_freq) freqs.old = policy->cur; freqs.new = target_freq; - freqs.cpu = policy->cpu; if (freqs.new == freqs.old) goto out; @@ -105,8 +104,7 @@ static int exynos_cpufreq_scale(unsigned int target_freq) } arm_volt = volt_table[index]; - for_each_cpu(freqs.cpu, policy->cpus) - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* When the new frequency is higher than current frequency */ if ((freqs.new > freqs.old) && !safe_arm_volt) { @@ -131,8 +129,7 @@ static int exynos_cpufreq_scale(unsigned int target_freq) exynos_info->set_freq(old_index, index); - for_each_cpu(freqs.cpu, policy->cpus) - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, 
CPUFREQ_POSTCHANGE); /* When the new frequency is lower than current frequency */ if ((freqs.new < freqs.old) || diff --git a/drivers/cpufreq/gx-suspmod.c b/drivers/cpufreq/gx-suspmod.c index 456bee058fe6..3dfc99b9ca86 100644 --- a/drivers/cpufreq/gx-suspmod.c +++ b/drivers/cpufreq/gx-suspmod.c @@ -251,14 +251,13 @@ static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration, * set cpu speed in khz. **/ -static void gx_set_cpuspeed(unsigned int khz) +static void gx_set_cpuspeed(struct cpufreq_policy *policy, unsigned int khz) { u8 suscfg, pmer1; unsigned int new_khz; unsigned long flags; struct cpufreq_freqs freqs; - freqs.cpu = 0; freqs.old = gx_get_cpuspeed(0); new_khz = gx_validate_speed(khz, &gx_params->on_duration, @@ -266,11 +265,9 @@ static void gx_set_cpuspeed(unsigned int khz) freqs.new = new_khz; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); local_irq_save(flags); - - if (new_khz != stock_freq) { /* if new khz == 100% of CPU speed, it is special case */ switch (gx_params->cs55x0->device) { @@ -317,7 +314,7 @@ static void gx_set_cpuspeed(unsigned int khz) gx_params->pci_suscfg = suscfg; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); pr_debug("suspend modulation w/ duration of ON:%d us, OFF:%d us\n", gx_params->on_duration * 32, gx_params->off_duration * 32); @@ -397,7 +394,7 @@ static int cpufreq_gx_target(struct cpufreq_policy *policy, tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2); } - gx_set_cpuspeed(tmp_freq); + gx_set_cpuspeed(policy, tmp_freq); return 0; } diff --git a/drivers/cpufreq/imx6q-cpufreq.c b/drivers/cpufreq/imx6q-cpufreq.c index 54e336de373b..b78bc35973ba 100644 --- a/drivers/cpufreq/imx6q-cpufreq.c +++ b/drivers/cpufreq/imx6q-cpufreq.c @@ -50,7 +50,7 @@ static int imx6q_set_target(struct cpufreq_policy *policy, struct cpufreq_freqs freqs; struct opp *opp; unsigned long freq_hz, volt, volt_old; - unsigned int index, cpu; + unsigned int index; int ret; ret = cpufreq_frequency_table_target(policy, freq_table, target_freq, @@ -68,10 +68,7 @@ static int imx6q_set_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - for_each_online_cpu(cpu) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); rcu_read_lock(); opp = opp_find_freq_ceil(cpu_dev, &freq_hz); @@ -166,10 +163,7 @@ static int imx6q_set_target(struct cpufreq_policy *policy, } } - for_each_online_cpu(cpu) { - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/drivers/cpufreq/kirkwood-cpufreq.c b/drivers/cpufreq/kirkwood-cpufreq.c index 60524764fe4e..d36ea8dc96eb 100644 --- a/drivers/cpufreq/kirkwood-cpufreq.c +++ b/drivers/cpufreq/kirkwood-cpufreq.c @@ -55,7 +55,8 @@ static unsigned int kirkwood_cpufreq_get_cpu_frequency(unsigned int cpu) return kirkwood_freq_table[0].frequency; } -static void kirkwood_cpufreq_set_cpu_state(unsigned int index) +static void kirkwood_cpufreq_set_cpu_state(struct cpufreq_policy *policy, + unsigned int index) { struct cpufreq_freqs freqs; unsigned int state = kirkwood_freq_table[index].index; @@ -63,9 +64,8 @@ static void kirkwood_cpufreq_set_cpu_state(unsigned int index) freqs.old = kirkwood_cpufreq_get_cpu_frequency(0); freqs.new = kirkwood_freq_table[index].frequency; - freqs.cpu = 0; 
/* Kirkwood is UP */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); dev_dbg(priv.dev, "Attempting to set frequency to %i KHz\n", kirkwood_freq_table[index].frequency); @@ -99,7 +99,7 @@ static void kirkwood_cpufreq_set_cpu_state(unsigned int index) local_irq_enable(); } - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); }; static int kirkwood_cpufreq_verify(struct cpufreq_policy *policy) @@ -117,7 +117,7 @@ static int kirkwood_cpufreq_target(struct cpufreq_policy *policy, target_freq, relation, &index)) return -EINVAL; - kirkwood_cpufreq_set_cpu_state(index); + kirkwood_cpufreq_set_cpu_state(policy, index); return 0; } diff --git a/drivers/cpufreq/longhaul.c b/drivers/cpufreq/longhaul.c index 1180d536d1eb..b448638e34de 100644 --- a/drivers/cpufreq/longhaul.c +++ b/drivers/cpufreq/longhaul.c @@ -242,7 +242,8 @@ static void do_powersaver(int cx_address, unsigned int mults_index, * Sets a new clock ratio. */ -static void longhaul_setstate(unsigned int table_index) +static void longhaul_setstate(struct cpufreq_policy *policy, + unsigned int table_index) { unsigned int mults_index; int speed, mult; @@ -267,9 +268,8 @@ static void longhaul_setstate(unsigned int table_index) freqs.old = calc_speed(longhaul_get_cpu_mult()); freqs.new = speed; - freqs.cpu = 0; /* longhaul.c is UP only driver */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); pr_debug("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n", fsb, mult/10, mult%10, print_speed(speed/1000)); @@ -386,7 +386,7 @@ retry_loop: } } /* Report true CPU frequency */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); if (!bm_timeout) printk(KERN_INFO PFX "Warning: Timeout while waiting for " @@ -648,7 +648,7 @@ static int longhaul_target(struct cpufreq_policy *policy, return 0; if (!can_scale_voltage) - longhaul_setstate(table_index); + longhaul_setstate(policy, table_index); else { /* On test system voltage transitions exceeding single * step up or down were turning motherboard off. 
Both @@ -663,7 +663,7 @@ static int longhaul_target(struct cpufreq_policy *policy, while (i != table_index) { vid = (longhaul_table[i].index >> 8) & 0x1f; if (vid != current_vid) { - longhaul_setstate(i); + longhaul_setstate(policy, i); current_vid = vid; msleep(200); } @@ -672,7 +672,7 @@ static int longhaul_target(struct cpufreq_policy *policy, else i--; } - longhaul_setstate(table_index); + longhaul_setstate(policy, table_index); } longhaul_index = table_index; return 0; @@ -998,15 +998,17 @@ static int __init longhaul_init(void) static void __exit longhaul_exit(void) { + struct cpufreq_policy *policy = cpufreq_cpu_get(0); int i; for (i = 0; i < numscales; i++) { if (mults[i] == maxmult) { - longhaul_setstate(i); + longhaul_setstate(policy, i); break; } } + cpufreq_cpu_put(policy); cpufreq_unregister_driver(&longhaul_driver); kfree(longhaul_table); } diff --git a/drivers/cpufreq/maple-cpufreq.c b/drivers/cpufreq/maple-cpufreq.c index d4c4989823dc..cdd62915efaf 100644 --- a/drivers/cpufreq/maple-cpufreq.c +++ b/drivers/cpufreq/maple-cpufreq.c @@ -158,11 +158,10 @@ static int maple_cpufreq_target(struct cpufreq_policy *policy, freqs.old = maple_cpu_freqs[maple_pmode_cur].frequency; freqs.new = maple_cpu_freqs[newstate].frequency; - freqs.cpu = 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); rc = maple_scom_switch_freq(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); mutex_unlock(&maple_switch_mutex); diff --git a/drivers/cpufreq/omap-cpufreq.c b/drivers/cpufreq/omap-cpufreq.c index 9128c07bafba..b610edd820b1 100644 --- a/drivers/cpufreq/omap-cpufreq.c +++ b/drivers/cpufreq/omap-cpufreq.c @@ -88,16 +88,12 @@ static int omap_target(struct cpufreq_policy *policy, } freqs.old = omap_getspeed(policy->cpu); - freqs.cpu = policy->cpu; if (freqs.old == freqs.new && policy->cur == freqs.new) return ret; /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); freq = freqs.new * 1000; ret = clk_round_rate(mpu_clk, freq); @@ -157,10 +153,7 @@ static int omap_target(struct cpufreq_policy *policy, done: /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return ret; } diff --git a/drivers/cpufreq/p4-clockmod.c b/drivers/cpufreq/p4-clockmod.c index 827629c9aad7..4b2e7737b939 100644 --- a/drivers/cpufreq/p4-clockmod.c +++ b/drivers/cpufreq/p4-clockmod.c @@ -125,10 +125,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* run on each logical CPU, * see section 13.15.3 of IA32 Intel Architecture Software @@ -138,10 +135,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/drivers/cpufreq/pcc-cpufreq.c b/drivers/cpufreq/pcc-cpufreq.c index 503996a94a6a..0de00081a81e 100644 --- 
a/drivers/cpufreq/pcc-cpufreq.c +++ b/drivers/cpufreq/pcc-cpufreq.c @@ -215,8 +215,7 @@ static int pcc_cpufreq_target(struct cpufreq_policy *policy, (pcch_virt_addr + pcc_cpu_data->input_offset)); freqs.new = target_freq; - freqs.cpu = cpu; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); input_buffer = 0x1 | (((target_freq * 100) / (ioread32(&pcch_hdr->nominal) * 1000)) << 8); @@ -237,7 +236,7 @@ static int pcc_cpufreq_target(struct cpufreq_policy *policy, } iowrite16(0, &pcch_hdr->status); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); pr_debug("target: was SUCCESSFUL for cpu %d\n", cpu); spin_unlock(&pcc_lock); diff --git a/drivers/cpufreq/powernow-k6.c b/drivers/cpufreq/powernow-k6.c index af23e0b9ec92..ea0222a45b7b 100644 --- a/drivers/cpufreq/powernow-k6.c +++ b/drivers/cpufreq/powernow-k6.c @@ -68,7 +68,8 @@ static int powernow_k6_get_cpu_multiplier(void) * * Tries to change the PowerNow! multiplier */ -static void powernow_k6_set_state(unsigned int best_i) +static void powernow_k6_set_state(struct cpufreq_policy *policy, + unsigned int best_i) { unsigned long outvalue = 0, invalue = 0; unsigned long msrval; @@ -81,9 +82,8 @@ static void powernow_k6_set_state(unsigned int best_i) freqs.old = busfreq * powernow_k6_get_cpu_multiplier(); freqs.new = busfreq * clock_ratio[best_i].index; - freqs.cpu = 0; /* powernow-k6.c is UP only driver */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* we now need to transform best_i to the BVC format, see AMD#23446 */ @@ -98,7 +98,7 @@ static void powernow_k6_set_state(unsigned int best_i) msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return; } @@ -136,7 +136,7 @@ static int powernow_k6_target(struct cpufreq_policy *policy, target_freq, relation, &newstate)) return -EINVAL; - powernow_k6_set_state(newstate); + powernow_k6_set_state(policy, newstate); return 0; } @@ -182,7 +182,7 @@ static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) unsigned int i; for (i = 0; i < 8; i++) { if (i == max_multiplier) - powernow_k6_set_state(i); + powernow_k6_set_state(policy, i); } cpufreq_frequency_table_put_attr(policy->cpu); return 0; diff --git a/drivers/cpufreq/powernow-k7.c b/drivers/cpufreq/powernow-k7.c index 334cc2f1e9f1..53888dacbe58 100644 --- a/drivers/cpufreq/powernow-k7.c +++ b/drivers/cpufreq/powernow-k7.c @@ -248,7 +248,7 @@ static void change_VID(int vid) } -static void change_speed(unsigned int index) +static void change_speed(struct cpufreq_policy *policy, unsigned int index) { u8 fid, vid; struct cpufreq_freqs freqs; @@ -263,15 +263,13 @@ static void change_speed(unsigned int index) fid = powernow_table[index].index & 0xFF; vid = (powernow_table[index].index & 0xFF00) >> 8; - freqs.cpu = 0; - rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val); cfid = fidvidstatus.bits.CFID; freqs.old = fsb * fid_codes[cfid] / 10; freqs.new = powernow_table[index].frequency; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* Now do the magic poking into the MSRs. 
*/ @@ -292,7 +290,7 @@ static void change_speed(unsigned int index) if (have_a0 == 1) local_irq_enable(); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); } @@ -546,7 +544,7 @@ static int powernow_target(struct cpufreq_policy *policy, relation, &newstate)) return -EINVAL; - change_speed(newstate); + change_speed(policy, newstate); return 0; } diff --git a/drivers/cpufreq/powernow-k8.c b/drivers/cpufreq/powernow-k8.c index d13a13678b5f..52137a323965 100644 --- a/drivers/cpufreq/powernow-k8.c +++ b/drivers/cpufreq/powernow-k8.c @@ -928,9 +928,10 @@ static int get_transition_latency(struct powernow_k8_data *data) static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned int index) { + struct cpufreq_policy *policy; u32 fid = 0; u32 vid = 0; - int res, i; + int res; struct cpufreq_freqs freqs; pr_debug("cpu %d transition to index %u\n", smp_processor_id(), index); @@ -959,10 +960,10 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + policy = cpufreq_cpu_get(smp_processor_id()); + cpufreq_cpu_put(policy); + + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); res = transition_fid_vid(data, fid, vid); if (res) @@ -970,10 +971,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu(i, data->available_cores) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return res; } diff --git a/drivers/cpufreq/s3c2416-cpufreq.c b/drivers/cpufreq/s3c2416-cpufreq.c index bcc053bc02c4..4f1881eee3f1 100644 --- a/drivers/cpufreq/s3c2416-cpufreq.c +++ b/drivers/cpufreq/s3c2416-cpufreq.c @@ -256,7 +256,6 @@ static int s3c2416_cpufreq_set_target(struct cpufreq_policy *policy, goto out; } - freqs.cpu = 0; freqs.flags = 0; freqs.old = s3c_freq->is_dvs ? 
FREQ_DVS : clk_get_rate(s3c_freq->armclk) / 1000; @@ -274,7 +273,7 @@ static int s3c2416_cpufreq_set_target(struct cpufreq_policy *policy, if (!to_dvs && freqs.old == freqs.new) goto out; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (to_dvs) { pr_debug("cpufreq: enter dvs\n"); @@ -287,7 +286,7 @@ static int s3c2416_cpufreq_set_target(struct cpufreq_policy *policy, ret = s3c2416_cpufreq_set_armdiv(s3c_freq, freqs.new); } - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); out: mutex_unlock(&cpufreq_lock); diff --git a/drivers/cpufreq/s3c64xx-cpufreq.c b/drivers/cpufreq/s3c64xx-cpufreq.c index 6f9490b3c356..27cacb524796 100644 --- a/drivers/cpufreq/s3c64xx-cpufreq.c +++ b/drivers/cpufreq/s3c64xx-cpufreq.c @@ -84,7 +84,6 @@ static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy, if (ret != 0) return ret; - freqs.cpu = 0; freqs.old = clk_get_rate(armclk) / 1000; freqs.new = s3c64xx_freq_table[i].frequency; freqs.flags = 0; @@ -95,7 +94,7 @@ static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy, pr_debug("Transition %d-%dkHz\n", freqs.old, freqs.new); - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); #ifdef CONFIG_REGULATOR if (vddarm && freqs.new > freqs.old) { @@ -117,7 +116,7 @@ static int s3c64xx_cpufreq_set_target(struct cpufreq_policy *policy, goto err; } - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); #ifdef CONFIG_REGULATOR if (vddarm && freqs.new < freqs.old) { @@ -141,7 +140,7 @@ err_clk: if (clk_set_rate(armclk, freqs.old * 1000) < 0) pr_err("Failed to restore original clock rate\n"); err: - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return ret; } diff --git a/drivers/cpufreq/s5pv210-cpufreq.c b/drivers/cpufreq/s5pv210-cpufreq.c index a484aaea9809..5c7757073793 100644 --- a/drivers/cpufreq/s5pv210-cpufreq.c +++ b/drivers/cpufreq/s5pv210-cpufreq.c @@ -229,7 +229,6 @@ static int s5pv210_target(struct cpufreq_policy *policy, } freqs.new = s5pv210_freq_table[index].frequency; - freqs.cpu = 0; if (freqs.new == freqs.old) goto exit; @@ -256,7 +255,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, goto exit; } - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); /* Check if there need to change PLL */ if ((index == L0) || (priv_index == L0)) @@ -468,7 +467,7 @@ static int s5pv210_target(struct cpufreq_policy *policy, } } - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); if (freqs.new < freqs.old) { regulator_set_voltage(int_regulator, diff --git a/drivers/cpufreq/sc520_freq.c b/drivers/cpufreq/sc520_freq.c index e42e073cd9b8..f740b134d27b 100644 --- a/drivers/cpufreq/sc520_freq.c +++ b/drivers/cpufreq/sc520_freq.c @@ -53,7 +53,8 @@ static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu) } } -static void sc520_freq_set_cpu_state(unsigned int state) +static void sc520_freq_set_cpu_state(struct cpufreq_policy *policy, + unsigned int state) { struct cpufreq_freqs freqs; @@ -61,9 +62,8 @@ static void sc520_freq_set_cpu_state(unsigned int state) freqs.old = sc520_freq_get_cpu_frequency(0); freqs.new = sc520_freq_table[state].frequency; - freqs.cpu = 
0; /* AMD Elan is UP */ - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); pr_debug("attempting to set frequency to %i kHz\n", sc520_freq_table[state].frequency); @@ -75,7 +75,7 @@ static void sc520_freq_set_cpu_state(unsigned int state) local_irq_enable(); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); }; static int sc520_freq_verify(struct cpufreq_policy *policy) @@ -93,7 +93,7 @@ static int sc520_freq_target(struct cpufreq_policy *policy, target_freq, relation, &newstate)) return -EINVAL; - sc520_freq_set_cpu_state(newstate); + sc520_freq_set_cpu_state(policy, newstate); return 0; } diff --git a/drivers/cpufreq/spear-cpufreq.c b/drivers/cpufreq/spear-cpufreq.c index 7e4d77327957..156829f4576d 100644 --- a/drivers/cpufreq/spear-cpufreq.c +++ b/drivers/cpufreq/spear-cpufreq.c @@ -121,7 +121,6 @@ static int spear_cpufreq_target(struct cpufreq_policy *policy, target_freq, relation, &index)) return -EINVAL; - freqs.cpu = policy->cpu; freqs.old = spear_cpufreq_get(0); newfreq = spear_cpufreq.freq_tbl[index].frequency * 1000; @@ -158,8 +157,7 @@ static int spear_cpufreq_target(struct cpufreq_policy *policy, freqs.new = newfreq / 1000; freqs.new /= mult; - for_each_cpu(freqs.cpu, policy->cpus) - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); if (mult == 2) ret = spear1340_set_cpu_rate(srcclk, newfreq); @@ -172,8 +170,7 @@ static int spear_cpufreq_target(struct cpufreq_policy *policy, freqs.new = clk_get_rate(spear_cpufreq.clk) / 1000; } - for_each_cpu(freqs.cpu, policy->cpus) - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return ret; } diff --git a/drivers/cpufreq/speedstep-centrino.c b/drivers/cpufreq/speedstep-centrino.c index 3a953d519f46..3dbbcc3519af 100644 --- a/drivers/cpufreq/speedstep-centrino.c +++ b/drivers/cpufreq/speedstep-centrino.c @@ -457,7 +457,7 @@ static int centrino_target (struct cpufreq_policy *policy, unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; int retval = 0; - unsigned int j, k, first_cpu, tmp; + unsigned int j, first_cpu, tmp; cpumask_var_t covered_cpus; if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL))) @@ -522,13 +522,8 @@ static int centrino_target (struct cpufreq_policy *policy, pr_debug("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); - } first_cpu = 0; /* all but 16 LSB are reserved, treat them with care */ @@ -544,12 +539,7 @@ static int centrino_target (struct cpufreq_policy *policy, cpumask_set_cpu(j, covered_cpus); } - for_each_cpu(k, policy->cpus) { - if (!cpu_online(k)) - continue; - freqs.cpu = k; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); if (unlikely(retval)) { /* @@ -565,12 +555,8 @@ static int centrino_target (struct cpufreq_policy *policy, tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu(j, policy->cpus) { - if (!cpu_online(j)) - continue; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, 
CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); } retval = 0; diff --git a/drivers/cpufreq/speedstep-ich.c b/drivers/cpufreq/speedstep-ich.c index e29b59aa68a8..e2e5aa971452 100644 --- a/drivers/cpufreq/speedstep-ich.c +++ b/drivers/cpufreq/speedstep-ich.c @@ -263,7 +263,6 @@ static int speedstep_target(struct cpufreq_policy *policy, { unsigned int newstate = 0, policy_cpu; struct cpufreq_freqs freqs; - int i; if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0], target_freq, relation, &newstate)) @@ -272,7 +271,6 @@ static int speedstep_target(struct cpufreq_policy *policy, policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask); freqs.old = speedstep_get(policy_cpu); freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = policy->cpu; pr_debug("transiting from %u to %u kHz\n", freqs.old, freqs.new); @@ -280,18 +278,12 @@ static int speedstep_target(struct cpufreq_policy *policy, if (freqs.old == freqs.new) return 0; - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate, true); - for_each_cpu(i, policy->cpus) { - freqs.cpu = i; - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); - } + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/drivers/cpufreq/speedstep-smi.c b/drivers/cpufreq/speedstep-smi.c index 6a457fcaaad5..f5a6b70ee6c0 100644 --- a/drivers/cpufreq/speedstep-smi.c +++ b/drivers/cpufreq/speedstep-smi.c @@ -252,14 +252,13 @@ static int speedstep_target(struct cpufreq_policy *policy, freqs.old = speedstep_freqs[speedstep_get_state()].frequency; freqs.new = speedstep_freqs[newstate].frequency; - freqs.cpu = 0; /* speedstep.c is UP only driver */ if (freqs.old == freqs.new) return 0; - cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_PRECHANGE); speedstep_set_state(newstate); - cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); + cpufreq_notify_transition(policy, &freqs, CPUFREQ_POSTCHANGE); return 0; } diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 4bbc572dd521..037d36ae63e5 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -278,8 +278,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data); int cpufreq_unregister_driver(struct cpufreq_driver *driver_data); -void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state); - +void cpufreq_notify_transition(struct cpufreq_policy *policy, + struct cpufreq_freqs *freqs, unsigned int state); static inline void cpufreq_verify_within_limits(struct cpufreq_policy *policy, unsigned int min, unsigned int max) { -- cgit From bb5547acfcd842950b8a22aa83f84af93388b9f2 Mon Sep 17 00:00:00 2001 From: Varun Sethi Date: Fri, 29 Mar 2013 01:23:58 +0530 Subject: iommu/fsl: Make iova dma_addr_t in the iommu_iova_to_phys API. This is required in case of PAMU, as it can support a window size of up to 64G (even on 32bit). 
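As a minimal sketch of why the wider type matters (a hypothetical caller, not part of this patch; the domain pointer and the 36-bit address are assumed for illustration): on a 32-bit kernel an unsigned long iova would truncate anything above 4G, while dma_addr_t preserves it whenever CONFIG_ARCH_DMA_ADDR_T_64BIT is set.

	/* Hypothetical caller sketch, not part of this patch. */
	dma_addr_t iova = 0x900000000ULL;	/* 36-bit IOVA, above 4G */
	phys_addr_t phys = iommu_iova_to_phys(domain, iova);

	if (!phys)
		pr_err("no translation for iova 0x%llx\n",
		       (unsigned long long)iova);

With the old unsigned long argument this example address would have silently truncated to 0 on 32-bit.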
Signed-off-by: Varun Sethi Signed-off-by: Joerg Roedel --- drivers/iommu/amd_iommu.c | 2 +- drivers/iommu/exynos-iommu.c | 2 +- drivers/iommu/intel-iommu.c | 2 +- drivers/iommu/iommu.c | 3 +-- drivers/iommu/msm_iommu.c | 2 +- drivers/iommu/omap-iommu.c | 2 +- drivers/iommu/shmobile-iommu.c | 2 +- drivers/iommu/tegra-gart.c | 2 +- drivers/iommu/tegra-smmu.c | 2 +- include/linux/iommu.h | 9 +++------ 10 files changed, 12 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index b287ca33833d..a7f6b04eaa5e 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3410,7 +3410,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, - unsigned long iova) + dma_addr_t iova) { struct protection_domain *domain = dom->priv; unsigned long offset_mask; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 238a3caa949a..3f32d64ab87a 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1027,7 +1027,7 @@ done: } static phys_addr_t exynos_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) + dma_addr_t iova) { struct exynos_iommu_domain *priv = domain->priv; unsigned long *entry; diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index 0099667a397e..6e0b9ffc79b5 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -4111,7 +4111,7 @@ static size_t intel_iommu_unmap(struct iommu_domain *domain, } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) + dma_addr_t iova) { struct dmar_domain *dmar_domain = domain->priv; struct dma_pte *pte; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index b972d430d92b..f730ed9d8af9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -706,8 +706,7 @@ void iommu_detach_group(struct iommu_domain *domain, struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_detach_group); -phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) +phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { if (unlikely(domain->ops->iova_to_phys == NULL)) return 0; diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index 6a8870a31668..8ab4f41090af 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -554,7 +554,7 @@ fail: } static phys_addr_t msm_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long va) + dma_addr_t va) { struct msm_priv *priv; struct msm_iommu_drvdata *iommu_drvdata; diff --git a/drivers/iommu/omap-iommu.c b/drivers/iommu/omap-iommu.c index 6ac02fa5910f..e02e5d71745b 100644 --- a/drivers/iommu/omap-iommu.c +++ b/drivers/iommu/omap-iommu.c @@ -1219,7 +1219,7 @@ static void omap_iommu_domain_destroy(struct iommu_domain *domain) } static phys_addr_t omap_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long da) + dma_addr_t da) { struct omap_iommu_domain *omap_domain = domain->priv; struct omap_iommu *oiommu = omap_domain->iommu_dev; diff --git a/drivers/iommu/shmobile-iommu.c b/drivers/iommu/shmobile-iommu.c index b6e8b57cf0a8..d572863dfccd 100644 --- a/drivers/iommu/shmobile-iommu.c +++ b/drivers/iommu/shmobile-iommu.c @@ -296,7 +296,7 @@ done: } static phys_addr_t shmobile_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) + dma_addr_t iova) { struct shmobile_iommu_domain *sh_domain = 
domain->priv; uint32_t l1entry = 0, l2entry = 0; diff --git a/drivers/iommu/tegra-gart.c b/drivers/iommu/tegra-gart.c index 86437575f94d..4aec8be38054 100644 --- a/drivers/iommu/tegra-gart.c +++ b/drivers/iommu/tegra-gart.c @@ -279,7 +279,7 @@ static size_t gart_iommu_unmap(struct iommu_domain *domain, unsigned long iova, } static phys_addr_t gart_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) + dma_addr_t iova) { struct gart_device *gart = domain->priv; unsigned long pte; diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index b34e5fd7fd9e..bc9b59949d09 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -757,7 +757,7 @@ static size_t smmu_iommu_unmap(struct iommu_domain *domain, unsigned long iova, } static phys_addr_t smmu_iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) + dma_addr_t iova) { struct smmu_as *as = domain->priv; unsigned long *pte; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index ba3b8a98a049..bb0a0fc26729 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -91,8 +91,7 @@ struct iommu_ops { phys_addr_t paddr, size_t size, int prot); size_t (*unmap)(struct iommu_domain *domain, unsigned long iova, size_t size); - phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, - unsigned long iova); + phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); int (*domain_has_cap)(struct iommu_domain *domain, unsigned long cap); int (*add_device)(struct device *dev); @@ -134,8 +133,7 @@ extern int iommu_map(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t size, int prot); extern size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size); -extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova); +extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova); extern int iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap); extern void iommu_set_fault_handler(struct iommu_domain *domain, @@ -267,8 +265,7 @@ static inline void iommu_domain_window_disable(struct iommu_domain *domain, { } -static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, - unsigned long iova) +static inline phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova) { return 0; } -- cgit From 80f97f0f73b82444f714651ea053838d27779dca Mon Sep 17 00:00:00 2001 From: Varun Sethi Date: Fri, 29 Mar 2013 01:24:00 +0530 Subject: iommu/fsl: Add the window permission flag as a parameter to iommu_window_enable API. Each iommu window can have access permissions associated with it. Extended the window_enable API to incorporate window access permissions. In case of PAMU each window can have its specific set of permissions. 
Signed-off-by: Varun Sethi Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 5 +++-- include/linux/iommu.h | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index f730ed9d8af9..1d72b4f5b006 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -853,12 +853,13 @@ EXPORT_SYMBOL_GPL(iommu_unmap); int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, - phys_addr_t paddr, u64 size) + phys_addr_t paddr, u64 size, int prot) { if (unlikely(domain->ops->domain_window_enable == NULL)) return -ENODEV; - return domain->ops->domain_window_enable(domain, wnd_nr, paddr, size); + return domain->ops->domain_window_enable(domain, wnd_nr, paddr, size, + prot); } EXPORT_SYMBOL_GPL(iommu_domain_window_enable); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index bb0a0fc26729..272781073110 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -104,7 +104,7 @@ struct iommu_ops { /* Window handling functions */ int (*domain_window_enable)(struct iommu_domain *domain, u32 wnd_nr, - phys_addr_t paddr, u64 size); + phys_addr_t paddr, u64 size, int prot); void (*domain_window_disable)(struct iommu_domain *domain, u32 wnd_nr); /* Set the numer of window per domain */ int (*domain_set_windows)(struct iommu_domain *domain, u32 w_count); @@ -169,7 +169,8 @@ extern int iommu_domain_set_attr(struct iommu_domain *domain, enum iommu_attr, /* Window handling function prototypes */ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, - phys_addr_t offset, u64 size); + phys_addr_t offset, u64 size, + int prot); extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); /** * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework @@ -255,7 +256,7 @@ static inline int iommu_unmap(struct iommu_domain *domain, unsigned long iova, static inline int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t paddr, - u64 size) + u64 size, int prot) { return -ENODEV; } -- cgit From 65b3841b9cb5fe1b239f12dbf033f9827d73d032 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 2 Apr 2013 09:35:07 +0000 Subject: of_net.h: Provide empty functions if OF_NET is not configured of_get_mac_address() and of_get_phy_mode() are only provided if OF_NET is configured. While most callers check for the define, not all do, and those who do require #ifdef around the code. For those who don't, the missing check can result in errors such as arch/powerpc/sysdev/tsi108_dev.c:107:3: error: implicit declaration of function 'of_get_mac_address' [-Werror=implicit-function-declaration] arch/powerpc/sysdev/mv64x60_dev.c:253:2: error: implicit declaration of function 'of_get_mac_address' [-Werror=implicit-function-declaration] Provide empty functions if OF_NET is not configured. This is safe because all callers do check the return values. Cc: David Daney Signed-off-by: Guenter Roeck Acked-by: Rob Herring Signed-off-by: David S. 
Miller --- include/linux/of_net.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/of_net.h b/include/linux/of_net.h index f47464188710..61bf53b02779 100644 --- a/include/linux/of_net.h +++ b/include/linux/of_net.h @@ -11,6 +11,16 @@ #include extern const int of_get_phy_mode(struct device_node *np); extern const void *of_get_mac_address(struct device_node *np); +#else +static inline const int of_get_phy_mode(struct device_node *np) +{ + return -ENODEV; +} + +static inline const void *of_get_mac_address(struct device_node *np) +{ + return NULL; +} #endif #endif /* __LINUX_OF_NET_H */ -- cgit From 35e1d5f6344ed1bb2fab61ac7934aa0f19908b2c Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 2 Apr 2013 13:24:05 +0100 Subject: regulator: ab8500-ext: Remove unused REGULATOR_AB8500_EXT guard Before the AB8500 External Regulator driver was mainlined, it used to be conditionally compiled in using the CONFIG_REGULATOR_AB8500_EXT flag. During the review process that capability was removed, but the guard controlling prototyping slipped through the net. This patch cleans it up. Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- include/linux/regulator/ab8500.h | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index bb0140c9d4f4..44f67e8f1a6d 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -320,21 +320,6 @@ struct ab8500_regulator_platform_data { struct regulator_init_data *ext_regulator; }; -/* AB8500 external regulator functions (internal) */ -#ifdef CONFIG_REGULATOR_AB8500_EXT -int ab8500_ext_regulator_init(struct platform_device *pdev); -int ab8500_ext_regulator_exit(struct platform_device *pdev); -#else -inline int ab8500_ext_regulator_init(struct platform_device *pdev) -{ - return 0; -} -inline int ab8500_ext_regulator_exit(struct platform_device *pdev) -{ - return 0; -} -#endif - #ifdef CONFIG_REGULATOR_AB8500_DEBUG int ab8500_regulator_debug_init(struct platform_device *pdev); int ab8500_regulator_debug_exit(struct platform_device *pdev); @@ -349,4 +334,8 @@ static inline int ab8500_regulator_debug_exit(struct platform_device *pdev) } #endif +/* AB8500 external regulator functions. */ +int ab8500_ext_regulator_init(struct platform_device *pdev); +int ab8500_ext_regulator_exit(struct platform_device *pdev); + #endif -- cgit From 3566d40c1a4617461b38c82059bdc41d622faa8b Mon Sep 17 00:00:00 2001 From: James Hogan Date: Mon, 25 Mar 2013 14:35:07 +0000 Subject: clk: fix clk_mux::flags kerneldoc The kerneldoc comment for struct clk_mux documented the non-existent num_clks instead of flags. Correct this. Signed-off-by: James Hogan Signed-off-by: Mike Turquette --- include/linux/clk-provider.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index 1f0352802794..b1675074fe7c 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -284,7 +284,7 @@ struct clk *clk_register_divider_table(struct device *dev, const char *name, * @reg: register controlling multiplexer * @shift: shift to multiplexer bit field * @width: width of mutliplexer bit field - * @num_clks: number of parent clocks + * @flags: hardware-specific flags * @lock: register lock * * Clock with multiple selectable parents.
Implements .get_parent, .set_parent -- cgit From 119f5e448d32c11faf22fe81f6f2d78467a47149 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 13 Mar 2013 20:32:13 +0900 Subject: gpio: Renesas R-Car GPIO driver V3 This patch is V3 of a GPIO driver for the R-Car series of SoCs from Renesas. This driver is designed to be reusable between multiple SoCs that share the same basic building block, but so far it has only been used on R-Car H1 (r8a7779). Each driver instance handles 32 GPIOs with individually maskable IRQs. The driver operates on a single I/O memory range, and the 32 GPIOs are hooked up to a single interrupt. In the case of R-Car H1, either external IRQ pins or GPIOs with interrupts can be used for on-board interrupts. For external IRQs 4 pins are supported, and in the case of GPIO there are 202 GPIOs exposed as 202 interrupts, hooked up via 6 driver instances to the GIC and the Cortex-A9 quad. At this point the driver interfaces as a regular platform device driver. In the future DT support will be submitted as an incremental feature patch. Signed-off-by: Magnus Damm Reviewed-by: Linus Walleij Signed-off-by: Simon Horman --- drivers/gpio/Kconfig | 6 + drivers/gpio/Makefile | 1 + drivers/gpio/gpio-rcar.c | 373 ++++++++++++++++++++++++++++++++ include/linux/platform_data/gpio-rcar.h | 25 +++ 4 files changed, 405 insertions(+) create mode 100644 drivers/gpio/gpio-rcar.c create mode 100644 include/linux/platform_data/gpio-rcar.h (limited to 'include/linux') diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 93aaadf99f28..d766e3cbef18 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -204,6 +204,12 @@ config GPIO_PXA help Say yes here to support the PXA GPIO device +config GPIO_RCAR + tristate "Renesas R-Car GPIO" + depends on ARM + help + Say yes here to support GPIO on Renesas R-Car SoCs. + config GPIO_SPEAR_SPICS bool "ST SPEAr13xx SPI Chip Select as GPIO support" depends on PLAT_SPEAR diff --git a/drivers/gpio/Makefile b/drivers/gpio/Makefile index 22e07bc9fcb5..b41c74d45287 100644 --- a/drivers/gpio/Makefile +++ b/drivers/gpio/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_GPIO_PL061) += gpio-pl061.o obj-$(CONFIG_GPIO_PXA) += gpio-pxa.o obj-$(CONFIG_GPIO_RC5T583) += gpio-rc5t583.o obj-$(CONFIG_GPIO_RDC321X) += gpio-rdc321x.o +obj-$(CONFIG_GPIO_RCAR) += gpio-rcar.o obj-$(CONFIG_PLAT_SAMSUNG) += gpio-samsung.o obj-$(CONFIG_ARCH_SA1100) += gpio-sa1100.o obj-$(CONFIG_GPIO_SCH) += gpio-sch.o diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c new file mode 100644 index 000000000000..581ba56131a7 --- /dev/null +++ b/drivers/gpio/gpio-rcar.c @@ -0,0 +1,373 @@ +/* + * Renesas R-Car GPIO Support + * + * Copyright (C) 2013 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct gpio_rcar_priv { + void __iomem *base; + spinlock_t lock; + struct gpio_rcar_config config; + struct platform_device *pdev; + struct gpio_chip gpio_chip; + struct irq_chip irq_chip; + struct irq_domain *irq_domain; +}; + +#define IOINTSEL 0x00 +#define INOUTSEL 0x04 +#define OUTDT 0x08 +#define INDT 0x0c +#define INTDT 0x10 +#define INTCLR 0x14 +#define INTMSK 0x18 +#define MSKCLR 0x1c +#define POSNEG 0x20 +#define EDGLEVEL 0x24 +#define FILONOFF 0x28 + +static inline u32 gpio_rcar_read(struct gpio_rcar_priv *p, int offs) +{ + return ioread32(p->base + offs); +} + +static inline void gpio_rcar_write(struct gpio_rcar_priv *p, int offs, + u32 value) +{ + iowrite32(value, p->base + offs); +} + +static void gpio_rcar_modify_bit(struct gpio_rcar_priv *p, int offs, + int bit, bool value) +{ + u32 tmp = gpio_rcar_read(p, offs); + + if (value) + tmp |= BIT(bit); + else + tmp &= ~BIT(bit); + + gpio_rcar_write(p, offs, tmp); +} + +static void gpio_rcar_irq_disable(struct irq_data *d) +{ + struct gpio_rcar_priv *p = irq_data_get_irq_chip_data(d); + + gpio_rcar_write(p, INTMSK, ~BIT(irqd_to_hwirq(d))); +} + +static void gpio_rcar_irq_enable(struct irq_data *d) +{ + struct gpio_rcar_priv *p = irq_data_get_irq_chip_data(d); + + gpio_rcar_write(p, MSKCLR, BIT(irqd_to_hwirq(d))); +} + +static void gpio_rcar_config_interrupt_input_mode(struct gpio_rcar_priv *p, + unsigned int hwirq, + bool active_high_rising_edge, + bool level_trigger) +{ + unsigned long flags; + + /* follow steps in the GPIO documentation for + * "Setting Edge-Sensitive Interrupt Input Mode" and + * "Setting Level-Sensitive Interrupt Input Mode" + */ + + spin_lock_irqsave(&p->lock, flags); + + /* Configure postive or negative logic in POSNEG */ + gpio_rcar_modify_bit(p, POSNEG, hwirq, !active_high_rising_edge); + + /* Configure edge or level trigger in EDGLEVEL */ + gpio_rcar_modify_bit(p, EDGLEVEL, hwirq, !level_trigger); + + /* Select "Interrupt Input Mode" in IOINTSEL */ + gpio_rcar_modify_bit(p, IOINTSEL, hwirq, true); + + /* Write INTCLR in case of edge trigger */ + if (!level_trigger) + gpio_rcar_write(p, INTCLR, BIT(hwirq)); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static int gpio_rcar_irq_set_type(struct irq_data *d, unsigned int type) +{ + struct gpio_rcar_priv *p = irq_data_get_irq_chip_data(d); + unsigned int hwirq = irqd_to_hwirq(d); + + dev_dbg(&p->pdev->dev, "sense irq = %d, type = %d\n", hwirq, type); + + switch (type & IRQ_TYPE_SENSE_MASK) { + case IRQ_TYPE_LEVEL_HIGH: + gpio_rcar_config_interrupt_input_mode(p, hwirq, true, true); + break; + case IRQ_TYPE_LEVEL_LOW: + gpio_rcar_config_interrupt_input_mode(p, hwirq, false, true); + break; + case IRQ_TYPE_EDGE_RISING: + gpio_rcar_config_interrupt_input_mode(p, hwirq, true, false); + break; + case IRQ_TYPE_EDGE_FALLING: + gpio_rcar_config_interrupt_input_mode(p, hwirq, false, false); + break; + default: + return -EINVAL; + } + return 0; +} + +static irqreturn_t gpio_rcar_irq_handler(int irq, void *dev_id) +{ + struct gpio_rcar_priv *p = dev_id; + u32 pending; + unsigned int offset, irqs_handled = 0; + + while ((pending = gpio_rcar_read(p, INTDT))) { + offset = __ffs(pending); + gpio_rcar_write(p, INTCLR, BIT(offset)); + generic_handle_irq(irq_find_mapping(p->irq_domain, offset)); + irqs_handled++; + } + + return irqs_handled ? 
IRQ_HANDLED : IRQ_NONE; +} + +static inline struct gpio_rcar_priv *gpio_to_priv(struct gpio_chip *chip) +{ + return container_of(chip, struct gpio_rcar_priv, gpio_chip); +} + +static void gpio_rcar_config_general_input_output_mode(struct gpio_chip *chip, + unsigned int gpio, + bool output) +{ + struct gpio_rcar_priv *p = gpio_to_priv(chip); + unsigned long flags; + + /* follow steps in the GPIO documentation for + * "Setting General Output Mode" and + * "Setting General Input Mode" + */ + + spin_lock_irqsave(&p->lock, flags); + + /* Configure postive logic in POSNEG */ + gpio_rcar_modify_bit(p, POSNEG, gpio, false); + + /* Select "General Input/Output Mode" in IOINTSEL */ + gpio_rcar_modify_bit(p, IOINTSEL, gpio, false); + + /* Select Input Mode or Output Mode in INOUTSEL */ + gpio_rcar_modify_bit(p, INOUTSEL, gpio, output); + + spin_unlock_irqrestore(&p->lock, flags); +} + +static int gpio_rcar_direction_input(struct gpio_chip *chip, unsigned offset) +{ + gpio_rcar_config_general_input_output_mode(chip, offset, false); + return 0; +} + +static int gpio_rcar_get(struct gpio_chip *chip, unsigned offset) +{ + return (int)(gpio_rcar_read(gpio_to_priv(chip), INDT) & BIT(offset)); +} + +static void gpio_rcar_set(struct gpio_chip *chip, unsigned offset, int value) +{ + struct gpio_rcar_priv *p = gpio_to_priv(chip); + unsigned long flags; + + spin_lock_irqsave(&p->lock, flags); + gpio_rcar_modify_bit(p, OUTDT, offset, value); + spin_unlock_irqrestore(&p->lock, flags); +} + +static int gpio_rcar_direction_output(struct gpio_chip *chip, unsigned offset, + int value) +{ + /* write GPIO value to output before selecting output mode of pin */ + gpio_rcar_set(chip, offset, value); + gpio_rcar_config_general_input_output_mode(chip, offset, true); + return 0; +} + +static int gpio_rcar_to_irq(struct gpio_chip *chip, unsigned offset) +{ + return irq_create_mapping(gpio_to_priv(chip)->irq_domain, offset); +} + +static int gpio_rcar_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + struct gpio_rcar_priv *p = h->host_data; + + dev_dbg(&p->pdev->dev, "map hw irq = %d, virq = %d\n", (int)hw, virq); + + irq_set_chip_data(virq, h->host_data); + irq_set_chip_and_handler(virq, &p->irq_chip, handle_level_irq); + set_irq_flags(virq, IRQF_VALID); /* kill me now */ + return 0; +} + +static struct irq_domain_ops gpio_rcar_irq_domain_ops = { + .map = gpio_rcar_irq_domain_map, +}; + +static int gpio_rcar_probe(struct platform_device *pdev) +{ + struct gpio_rcar_config *pdata = pdev->dev.platform_data; + struct gpio_rcar_priv *p; + struct resource *io, *irq; + struct gpio_chip *gpio_chip; + struct irq_chip *irq_chip; + const char *name = dev_name(&pdev->dev); + int ret; + + p = devm_kzalloc(&pdev->dev, sizeof(*p), GFP_KERNEL); + if (!p) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + ret = -ENOMEM; + goto err0; + } + + /* deal with driver instance configuration */ + if (pdata) + p->config = *pdata; + + p->pdev = pdev; + platform_set_drvdata(pdev, p); + spin_lock_init(&p->lock); + + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + irq = platform_get_resource(pdev, IORESOURCE_IRQ, 0); + + if (!io || !irq) { + dev_err(&pdev->dev, "missing IRQ or IOMEM\n"); + ret = -EINVAL; + goto err0; + } + + p->base = devm_ioremap_nocache(&pdev->dev, io->start, + resource_size(io)); + if (!p->base) { + dev_err(&pdev->dev, "failed to remap I/O memory\n"); + ret = -ENXIO; + goto err0; + } + + gpio_chip = &p->gpio_chip; + gpio_chip->direction_input = gpio_rcar_direction_input; + 
gpio_chip->get = gpio_rcar_get; + gpio_chip->direction_output = gpio_rcar_direction_output; + gpio_chip->set = gpio_rcar_set; + gpio_chip->to_irq = gpio_rcar_to_irq; + gpio_chip->label = name; + gpio_chip->owner = THIS_MODULE; + gpio_chip->base = p->config.gpio_base; + gpio_chip->ngpio = p->config.number_of_pins; + + irq_chip = &p->irq_chip; + irq_chip->name = name; + irq_chip->irq_mask = gpio_rcar_irq_disable; + irq_chip->irq_unmask = gpio_rcar_irq_enable; + irq_chip->irq_enable = gpio_rcar_irq_enable; + irq_chip->irq_disable = gpio_rcar_irq_disable; + irq_chip->irq_set_type = gpio_rcar_irq_set_type; + irq_chip->flags = IRQCHIP_SKIP_SET_WAKE | IRQCHIP_SET_TYPE_MASKED; + + p->irq_domain = irq_domain_add_simple(pdev->dev.of_node, + p->config.number_of_pins, + p->config.irq_base, + &gpio_rcar_irq_domain_ops, p); + if (!p->irq_domain) { + ret = -ENXIO; + dev_err(&pdev->dev, "cannot initialize irq domain\n"); + goto err1; + } + + if (devm_request_irq(&pdev->dev, irq->start, + gpio_rcar_irq_handler, 0, name, p)) { + dev_err(&pdev->dev, "failed to request IRQ\n"); + ret = -ENOENT; + goto err1; + } + + ret = gpiochip_add(gpio_chip); + if (ret) { + dev_err(&pdev->dev, "failed to add GPIO controller\n"); + goto err1; + } + + dev_info(&pdev->dev, "driving %d GPIOs\n", p->config.number_of_pins); + + /* warn in case of mismatch if irq base is specified */ + if (p->config.irq_base) { + ret = irq_find_mapping(p->irq_domain, 0); + if (p->config.irq_base != ret) + dev_warn(&pdev->dev, "irq base mismatch (%u/%u)\n", + p->config.irq_base, ret); + } + + return 0; + +err1: + irq_domain_remove(p->irq_domain); +err0: + return ret; +} + +static int gpio_rcar_remove(struct platform_device *pdev) +{ + struct gpio_rcar_priv *p = platform_get_drvdata(pdev); + int ret; + + ret = gpiochip_remove(&p->gpio_chip); + if (ret) + return ret; + + irq_domain_remove(p->irq_domain); + return 0; +} + +static struct platform_driver gpio_rcar_device_driver = { + .probe = gpio_rcar_probe, + .remove = gpio_rcar_remove, + .driver = { + .name = "gpio_rcar", + } +}; + +module_platform_driver(gpio_rcar_device_driver); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("Renesas R-Car GPIO Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/platform_data/gpio-rcar.h b/include/linux/platform_data/gpio-rcar.h new file mode 100644 index 000000000000..bebfcd86fb80 --- /dev/null +++ b/include/linux/platform_data/gpio-rcar.h @@ -0,0 +1,25 @@ +/* + * Renesas R-Car GPIO Support + * + * Copyright (C) 2013 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef __GPIO_RCAR_H__ +#define __GPIO_RCAR_H__ + +struct gpio_rcar_config { + unsigned int gpio_base; + unsigned int irq_base; + unsigned int number_of_pins; +}; + +#endif /* __GPIO_RCAR_H__ */ -- cgit From dc3465a943ed2dd5de37d3d60df5c4e11c49efcb Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 10 Mar 2013 03:27:00 +0100 Subject: gpio-rcar: Add pinctrl support Register the GPIO pin range, and request and free GPIO pins using the pinctrl API. 
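[Illustration, not part of the series: a minimal sketch of how board code might instantiate one driver instance with this platform data, including the pctl_name field added below. The register base, IRQ number and the "pfc-r8a7779" pin controller name are invented placeholders, not values taken from the patches.]

#include <linux/kernel.h>
#include <linux/ioport.h>
#include <linux/platform_device.h>
#include <linux/platform_data/gpio-rcar.h>

/* Hypothetical register window and interrupt for GPIO bank 1. */
static struct resource gpio1_resources[] = {
	DEFINE_RES_MEM(0xffc41000, 0x2c),
	DEFINE_RES_IRQ(142),
};

static struct gpio_rcar_config gpio1_platform_data = {
	.gpio_base	= 32,			/* bank 1 covers GPIOs 32..63 */
	.irq_base	= 0,			/* let the irq_domain allocate */
	.number_of_pins	= 32,
	.pctl_name	= "pfc-r8a7779",	/* assumed pinctrl device name */
};

static struct platform_device gpio1_device = {
	.name		= "gpio_rcar",		/* matches the platform driver */
	.id		= 1,
	.resource	= gpio1_resources,
	.num_resources	= ARRAY_SIZE(gpio1_resources),
	.dev		= {
		.platform_data	= &gpio1_platform_data,
	},
};

/* Board init code would then call platform_device_register(&gpio1_device);
 * matching happens on the "gpio_rcar" driver name. */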
Signed-off-by: Laurent Pinchart Acked-by: Linus Walleij Signed-off-by: Simon Horman --- drivers/gpio/gpio-rcar.c | 23 +++++++++++++++++++++++ include/linux/platform_data/gpio-rcar.h | 1 + 2 files changed, 24 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpio/gpio-rcar.c b/drivers/gpio/gpio-rcar.c index 581ba56131a7..b4ca450947b8 100644 --- a/drivers/gpio/gpio-rcar.c +++ b/drivers/gpio/gpio-rcar.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -190,6 +191,21 @@ static void gpio_rcar_config_general_input_output_mode(struct gpio_chip *chip, spin_unlock_irqrestore(&p->lock, flags); } +static int gpio_rcar_request(struct gpio_chip *chip, unsigned offset) +{ + return pinctrl_request_gpio(chip->base + offset); +} + +static void gpio_rcar_free(struct gpio_chip *chip, unsigned offset) +{ + pinctrl_free_gpio(chip->base + offset); + + /* Set the GPIO as an input to ensure that the next GPIO request won't + * drive the GPIO pin as an output. + */ + gpio_rcar_config_general_input_output_mode(chip, offset, false); +} + static int gpio_rcar_direction_input(struct gpio_chip *chip, unsigned offset) { gpio_rcar_config_general_input_output_mode(chip, offset, false); @@ -285,6 +301,8 @@ static int gpio_rcar_probe(struct platform_device *pdev) } gpio_chip = &p->gpio_chip; + gpio_chip->request = gpio_rcar_request; + gpio_chip->free = gpio_rcar_free; gpio_chip->direction_input = gpio_rcar_direction_input; gpio_chip->get = gpio_rcar_get; gpio_chip->direction_output = gpio_rcar_direction_output; @@ -337,6 +355,11 @@ static int gpio_rcar_probe(struct platform_device *pdev) p->config.irq_base, ret); } + ret = gpiochip_add_pin_range(gpio_chip, p->config.pctl_name, 0, + gpio_chip->base, gpio_chip->ngpio); + if (ret < 0) + dev_warn(&pdev->dev, "failed to add pin range\n"); + return 0; err1: diff --git a/include/linux/platform_data/gpio-rcar.h b/include/linux/platform_data/gpio-rcar.h index bebfcd86fb80..b253f77a7ddf 100644 --- a/include/linux/platform_data/gpio-rcar.h +++ b/include/linux/platform_data/gpio-rcar.h @@ -20,6 +20,7 @@ struct gpio_rcar_config { unsigned int gpio_base; unsigned int irq_base; unsigned int number_of_pins; + const char *pctl_name; }; #endif /* __GPIO_RCAR_H__ */ -- cgit From d7ca4c755a82eda8f0fc4f72c52130056b28c7d2 Mon Sep 17 00:00:00 2001 From: "Manjunathappa, Prakash" Date: Thu, 28 Mar 2013 18:41:59 +0530 Subject: ARM: davinci: mmc: derive version information from device name Remove specifying mmc controller IP version information via platform data, instead specify device name so that driver derives it from platform_device_id table. Also change the clock node name to match the changed dev_id. Tested on da850-evm to make sure driver loads without clk_get failures. 
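[Illustration, not part of the patch: the mechanism applied above — carrying the controller version in the platform_device_id table instead of in platform data — reduced to a self-contained sketch. Only the two device-name strings come from the patch; the "demo" identifiers are made up.]

#include <linux/module.h>
#include <linux/platform_device.h>

enum { MMC_CTLR_VERSION_1, MMC_CTLR_VERSION_2 };	/* mirrors the driver */

static const struct platform_device_id demo_mmc_devtype[] = {
	{ .name = "dm6441-mmc", .driver_data = MMC_CTLR_VERSION_1 },
	{ .name = "da830-mmc",  .driver_data = MMC_CTLR_VERSION_2 },
	{ /* sentinel */ },
};
MODULE_DEVICE_TABLE(platform, demo_mmc_devtype);

static int demo_mmc_probe(struct platform_device *pdev)
{
	const struct platform_device_id *id = platform_get_device_id(pdev);

	/* id is the table entry the device matched on; its driver_data
	 * replaces the version field formerly passed via platform data. */
	if (id)
		dev_info(&pdev->dev, "controller version %lu\n",
			 id->driver_data);
	return 0;
}

static struct platform_driver demo_mmc_driver = {
	.probe		= demo_mmc_probe,
	.id_table	= demo_mmc_devtype,
	.driver		= { .name = "demo-mmc" },
};
module_platform_driver(demo_mmc_driver);
MODULE_LICENSE("GPL");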
Signed-off-by: Manjunathappa, Prakash Reviewed-by: Sekhar Nori Acked-by: Arnd Bergmann Acked-by: Chris Ball Signed-off-by: Sekhar Nori --- arch/arm/mach-davinci/board-da830-evm.c | 1 - arch/arm/mach-davinci/board-da850-evm.c | 2 -- arch/arm/mach-davinci/board-dm355-evm.c | 1 - arch/arm/mach-davinci/board-dm365-evm.c | 1 - arch/arm/mach-davinci/board-dm644x-evm.c | 1 - arch/arm/mach-davinci/board-neuros-osd2.c | 1 - arch/arm/mach-davinci/board-omapl138-hawk.c | 1 - arch/arm/mach-davinci/board-tnetv107x-evm.c | 1 - arch/arm/mach-davinci/da830.c | 2 +- arch/arm/mach-davinci/da850.c | 4 ++-- arch/arm/mach-davinci/devices-da8xx.c | 4 ++-- arch/arm/mach-davinci/devices-tnetv107x.c | 4 ++-- arch/arm/mach-davinci/devices.c | 6 ++++-- arch/arm/mach-davinci/dm355.c | 4 ++-- arch/arm/mach-davinci/dm365.c | 4 ++-- arch/arm/mach-davinci/dm644x.c | 2 +- arch/arm/mach-davinci/tnetv107x.c | 4 ++-- drivers/mmc/host/davinci_mmc.c | 18 +++++++++++++++++- include/linux/platform_data/mmc-davinci.h | 3 --- 19 files changed, 35 insertions(+), 29 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-davinci/board-da830-evm.c b/arch/arm/mach-davinci/board-da830-evm.c index 6da25eebf911..12e6f756361d 100644 --- a/arch/arm/mach-davinci/board-da830-evm.c +++ b/arch/arm/mach-davinci/board-da830-evm.c @@ -246,7 +246,6 @@ static struct davinci_mmc_config da830_evm_mmc_config = { .wires = 8, .max_freq = 50000000, .caps = MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED, - .version = MMC_CTLR_VERSION_2, }; static inline void da830_evm_init_mmc(void) diff --git a/arch/arm/mach-davinci/board-da850-evm.c b/arch/arm/mach-davinci/board-da850-evm.c index c2dfe06563df..dcc8710936a5 100644 --- a/arch/arm/mach-davinci/board-da850-evm.c +++ b/arch/arm/mach-davinci/board-da850-evm.c @@ -802,7 +802,6 @@ static struct davinci_mmc_config da850_mmc_config = { .wires = 4, .max_freq = 50000000, .caps = MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED, - .version = MMC_CTLR_VERSION_2, }; static const short da850_evm_mmcsd0_pins[] __initconst = { @@ -1372,7 +1371,6 @@ static struct davinci_mmc_config da850_wl12xx_mmc_config = { .max_freq = 25000000, .caps = MMC_CAP_4_BIT_DATA | MMC_CAP_NONREMOVABLE | MMC_CAP_POWER_OFF_CARD, - .version = MMC_CTLR_VERSION_2, }; static const short da850_wl12xx_pins[] __initconst = { diff --git a/arch/arm/mach-davinci/board-dm355-evm.c b/arch/arm/mach-davinci/board-dm355-evm.c index 147b8e1a4407..bfdf8b979a64 100644 --- a/arch/arm/mach-davinci/board-dm355-evm.c +++ b/arch/arm/mach-davinci/board-dm355-evm.c @@ -280,7 +280,6 @@ static struct davinci_mmc_config dm355evm_mmc_config = { .wires = 4, .max_freq = 50000000, .caps = MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED, - .version = MMC_CTLR_VERSION_1, }; /* Don't connect anything to J10 unless you're only using USB host diff --git a/arch/arm/mach-davinci/board-dm365-evm.c b/arch/arm/mach-davinci/board-dm365-evm.c index c2d4958a0cb6..4cfdd9109e19 100644 --- a/arch/arm/mach-davinci/board-dm365-evm.c +++ b/arch/arm/mach-davinci/board-dm365-evm.c @@ -253,7 +253,6 @@ static struct davinci_mmc_config dm365evm_mmc_config = { .wires = 4, .max_freq = 50000000, .caps = MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED, - .version = MMC_CTLR_VERSION_2, }; static void dm365evm_emac_configure(void) diff --git a/arch/arm/mach-davinci/board-dm644x-evm.c b/arch/arm/mach-davinci/board-dm644x-evm.c index 71735e7797cc..c0206d5f2bf6 100644 --- a/arch/arm/mach-davinci/board-dm644x-evm.c +++ b/arch/arm/mach-davinci/board-dm644x-evm.c @@ -570,7 +570,6 @@ static struct 
davinci_mmc_config dm6446evm_mmc_config = { .get_cd = dm6444evm_mmc_get_cd, .get_ro = dm6444evm_mmc_get_ro, .wires = 4, - .version = MMC_CTLR_VERSION_1 }; static struct i2c_board_info __initdata i2c_info[] = { diff --git a/arch/arm/mach-davinci/board-neuros-osd2.c b/arch/arm/mach-davinci/board-neuros-osd2.c index 1c98107527fa..b70e83c03bed 100644 --- a/arch/arm/mach-davinci/board-neuros-osd2.c +++ b/arch/arm/mach-davinci/board-neuros-osd2.c @@ -164,7 +164,6 @@ static void __init davinci_ntosd2_map_io(void) static struct davinci_mmc_config davinci_ntosd2_mmc_config = { .wires = 4, - .version = MMC_CTLR_VERSION_1 }; diff --git a/arch/arm/mach-davinci/board-omapl138-hawk.c b/arch/arm/mach-davinci/board-omapl138-hawk.c index 5a2bd44da54d..328dbd8a37f5 100644 --- a/arch/arm/mach-davinci/board-omapl138-hawk.c +++ b/arch/arm/mach-davinci/board-omapl138-hawk.c @@ -136,7 +136,6 @@ static struct davinci_mmc_config da850_mmc_config = { .wires = 4, .max_freq = 50000000, .caps = MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED, - .version = MMC_CTLR_VERSION_2, }; static __init void omapl138_hawk_mmc_init(void) diff --git a/arch/arm/mach-davinci/board-tnetv107x-evm.c b/arch/arm/mach-davinci/board-tnetv107x-evm.c index 4f416023d4e2..ba798370fc96 100644 --- a/arch/arm/mach-davinci/board-tnetv107x-evm.c +++ b/arch/arm/mach-davinci/board-tnetv107x-evm.c @@ -85,7 +85,6 @@ static struct davinci_mmc_config mmc_config = { .wires = 4, .max_freq = 50000000, .caps = MMC_CAP_MMC_HIGHSPEED | MMC_CAP_SD_HIGHSPEED, - .version = MMC_CTLR_VERSION_1, }; static const short sdio1_pins[] __initconst = { diff --git a/arch/arm/mach-davinci/da830.c b/arch/arm/mach-davinci/da830.c index 678a54a64dae..abbaf0270be6 100644 --- a/arch/arm/mach-davinci/da830.c +++ b/arch/arm/mach-davinci/da830.c @@ -394,7 +394,7 @@ static struct clk_lookup da830_clks[] = { CLK(NULL, "tpcc", &tpcc_clk), CLK(NULL, "tptc0", &tptc0_clk), CLK(NULL, "tptc1", &tptc1_clk), - CLK("davinci_mmc.0", NULL, &mmcsd_clk), + CLK("da830-mmc.0", NULL, &mmcsd_clk), CLK(NULL, "uart0", &uart0_clk), CLK(NULL, "uart1", &uart1_clk), CLK(NULL, "uart2", &uart2_clk), diff --git a/arch/arm/mach-davinci/da850.c b/arch/arm/mach-davinci/da850.c index 2a2f60c54ec6..4d6933848abf 100644 --- a/arch/arm/mach-davinci/da850.c +++ b/arch/arm/mach-davinci/da850.c @@ -463,8 +463,8 @@ static struct clk_lookup da850_clks[] = { CLK("davinci_emac.1", NULL, &emac_clk), CLK("davinci-mcasp.0", NULL, &mcasp_clk), CLK("da8xx_lcdc.0", "fck", &lcdc_clk), - CLK("davinci_mmc.0", NULL, &mmcsd0_clk), - CLK("davinci_mmc.1", NULL, &mmcsd1_clk), + CLK("da830-mmc.0", NULL, &mmcsd0_clk), + CLK("da830-mmc.1", NULL, &mmcsd1_clk), CLK(NULL, "aemif", &aemif_clk), CLK(NULL, "usb11", &usb11_clk), CLK(NULL, "usb20", &usb20_clk), diff --git a/arch/arm/mach-davinci/devices-da8xx.c b/arch/arm/mach-davinci/devices-da8xx.c index fc50243b1481..cb97e07db284 100644 --- a/arch/arm/mach-davinci/devices-da8xx.c +++ b/arch/arm/mach-davinci/devices-da8xx.c @@ -664,7 +664,7 @@ static struct resource da8xx_mmcsd0_resources[] = { }; static struct platform_device da8xx_mmcsd0_device = { - .name = "davinci_mmc", + .name = "da830-mmc", .id = 0, .num_resources = ARRAY_SIZE(da8xx_mmcsd0_resources), .resource = da8xx_mmcsd0_resources, @@ -701,7 +701,7 @@ static struct resource da850_mmcsd1_resources[] = { }; static struct platform_device da850_mmcsd1_device = { - .name = "davinci_mmc", + .name = "da830-mmc", .id = 1, .num_resources = ARRAY_SIZE(da850_mmcsd1_resources), .resource = da850_mmcsd1_resources, diff --git 
a/arch/arm/mach-davinci/devices-tnetv107x.c b/arch/arm/mach-davinci/devices-tnetv107x.c index 773ab07a71a0..cfb194df18ed 100644 --- a/arch/arm/mach-davinci/devices-tnetv107x.c +++ b/arch/arm/mach-davinci/devices-tnetv107x.c @@ -218,7 +218,7 @@ static u64 mmc1_dma_mask = DMA_BIT_MASK(32); static struct platform_device mmc_devices[2] = { { - .name = "davinci_mmc", + .name = "dm6441-mmc", .id = 0, .dev = { .dma_mask = &mmc0_dma_mask, @@ -228,7 +228,7 @@ static struct platform_device mmc_devices[2] = { .resource = mmc0_resources }, { - .name = "davinci_mmc", + .name = "dm6441-mmc", .id = 1, .dev = { .dma_mask = &mmc1_dma_mask, diff --git a/arch/arm/mach-davinci/devices.c b/arch/arm/mach-davinci/devices.c index 4c48a36ee567..f6927df2dda8 100644 --- a/arch/arm/mach-davinci/devices.c +++ b/arch/arm/mach-davinci/devices.c @@ -150,7 +150,7 @@ static struct resource mmcsd0_resources[] = { }; static struct platform_device davinci_mmcsd0_device = { - .name = "davinci_mmc", + .name = "dm6441-mmc", .id = 0, .dev = { .dma_mask = &mmcsd0_dma_mask, @@ -187,7 +187,7 @@ static struct resource mmcsd1_resources[] = { }; static struct platform_device davinci_mmcsd1_device = { - .name = "davinci_mmc", + .name = "dm6441-mmc", .id = 1, .dev = { .dma_mask = &mmcsd1_dma_mask, @@ -235,6 +235,7 @@ void __init davinci_setup_mmc(int module, struct davinci_mmc_config *config) mmcsd1_resources[0].end = DM365_MMCSD1_BASE + SZ_4K - 1; mmcsd1_resources[2].start = IRQ_DM365_SDIOINT1; + davinci_mmcsd1_device.name = "da830-mmc"; } else break; @@ -256,6 +257,7 @@ void __init davinci_setup_mmc(int module, struct davinci_mmc_config *config) mmcsd0_resources[0].end = DM365_MMCSD0_BASE + SZ_4K - 1; mmcsd0_resources[2].start = IRQ_DM365_SDIOINT0; + davinci_mmcsd0_device.name = "da830-mmc"; } else if (cpu_is_davinci_dm644x()) { /* REVISIT: should this be in board-init code? 
*/ /* Power-on 3.3V IO cells */ diff --git a/arch/arm/mach-davinci/dm355.c b/arch/arm/mach-davinci/dm355.c index b49c3b77d55e..87e6104f45e6 100644 --- a/arch/arm/mach-davinci/dm355.c +++ b/arch/arm/mach-davinci/dm355.c @@ -361,8 +361,8 @@ static struct clk_lookup dm355_clks[] = { CLK("i2c_davinci.1", NULL, &i2c_clk), CLK("davinci-mcbsp.0", NULL, &asp0_clk), CLK("davinci-mcbsp.1", NULL, &asp1_clk), - CLK("davinci_mmc.0", NULL, &mmcsd0_clk), - CLK("davinci_mmc.1", NULL, &mmcsd1_clk), + CLK("dm6441-mmc.0", NULL, &mmcsd0_clk), + CLK("dm6441-mmc.1", NULL, &mmcsd1_clk), CLK("spi_davinci.0", NULL, &spi0_clk), CLK("spi_davinci.1", NULL, &spi1_clk), CLK("spi_davinci.2", NULL, &spi2_clk), diff --git a/arch/arm/mach-davinci/dm365.c b/arch/arm/mach-davinci/dm365.c index 6c3980540be0..2791df9187b3 100644 --- a/arch/arm/mach-davinci/dm365.c +++ b/arch/arm/mach-davinci/dm365.c @@ -454,8 +454,8 @@ static struct clk_lookup dm365_clks[] = { CLK(NULL, "uart0", &uart0_clk), CLK(NULL, "uart1", &uart1_clk), CLK("i2c_davinci.1", NULL, &i2c_clk), - CLK("davinci_mmc.0", NULL, &mmcsd0_clk), - CLK("davinci_mmc.1", NULL, &mmcsd1_clk), + CLK("da830-mmc.0", NULL, &mmcsd0_clk), + CLK("da830-mmc.1", NULL, &mmcsd1_clk), CLK("spi_davinci.0", NULL, &spi0_clk), CLK("spi_davinci.1", NULL, &spi1_clk), CLK("spi_davinci.2", NULL, &spi2_clk), diff --git a/arch/arm/mach-davinci/dm644x.c b/arch/arm/mach-davinci/dm644x.c index db1dd92e00af..ab6bf54c65c7 100644 --- a/arch/arm/mach-davinci/dm644x.c +++ b/arch/arm/mach-davinci/dm644x.c @@ -310,7 +310,7 @@ static struct clk_lookup dm644x_clks[] = { CLK("i2c_davinci.1", NULL, &i2c_clk), CLK("palm_bk3710", NULL, &ide_clk), CLK("davinci-mcbsp", NULL, &asp_clk), - CLK("davinci_mmc.0", NULL, &mmcsd_clk), + CLK("dm6441-mmc.0", NULL, &mmcsd_clk), CLK(NULL, "spi", &spi_clk), CLK(NULL, "gpio", &gpio_clk), CLK(NULL, "usb", &usb_clk), diff --git a/arch/arm/mach-davinci/tnetv107x.c b/arch/arm/mach-davinci/tnetv107x.c index dc1a209b9b66..3b2a70d43efa 100644 --- a/arch/arm/mach-davinci/tnetv107x.c +++ b/arch/arm/mach-davinci/tnetv107x.c @@ -272,7 +272,7 @@ static struct clk_lookup clks[] = { CLK("tnetv107x-keypad.0", NULL, &clk_keypad), CLK(NULL, "clk_gpio", &clk_gpio), CLK(NULL, "clk_mdio", &clk_mdio), - CLK("davinci_mmc.0", NULL, &clk_sdio0), + CLK("dm6441-mmc.0", NULL, &clk_sdio0), CLK(NULL, "uart0", &clk_uart0), CLK(NULL, "uart1", &clk_uart1), CLK(NULL, "timer0", &clk_timer0), @@ -292,7 +292,7 @@ static struct clk_lookup clks[] = { CLK(NULL, "clk_system", &clk_system), CLK(NULL, "clk_imcop", &clk_imcop), CLK(NULL, "clk_spare", &clk_spare), - CLK("davinci_mmc.1", NULL, &clk_sdio1), + CLK("dm6441-mmc.1", NULL, &clk_sdio1), CLK(NULL, "clk_ddr2_vrst", &clk_ddr2_vrst), CLK(NULL, "clk_ddr2_vctl_rst", &clk_ddr2_vctl_rst), CLK(NULL, NULL, NULL), diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index 20636772c09b..b5f1c019ecad 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -1157,6 +1157,18 @@ static void __init init_mmcsd_host(struct mmc_davinci_host *host) mmc_davinci_reset_ctrl(host, 0); } +static struct platform_device_id davinci_mmc_devtype[] = { + { + .name = "dm6441-mmc", + .driver_data = MMC_CTLR_VERSION_1, + }, { + .name = "da830-mmc", + .driver_data = MMC_CTLR_VERSION_2, + }, + {}, +}; +MODULE_DEVICE_TABLE(platform, davinci_mmc_devtype); + static int __init davinci_mmcsd_probe(struct platform_device *pdev) { struct davinci_mmc_config *pdata = pdev->dev.platform_data; @@ -1165,6 +1177,7 @@ static int __init davinci_mmcsd_probe(struct 
platform_device *pdev) struct resource *r, *mem = NULL; int ret = 0, irq = 0; size_t mem_size; + const struct platform_device_id *id_entry; /* REVISIT: when we're fully converted, fail if pdata is NULL */ @@ -1237,7 +1250,9 @@ static int __init davinci_mmcsd_probe(struct platform_device *pdev) if (pdata && (pdata->wires == 8)) mmc->caps |= (MMC_CAP_4_BIT_DATA | MMC_CAP_8_BIT_DATA); - host->version = pdata->version; + id_entry = platform_get_device_id(pdev); + if (id_entry) + host->version = id_entry->driver_data; mmc->ops = &mmc_davinci_ops; mmc->f_min = 312500; @@ -1408,6 +1423,7 @@ static struct platform_driver davinci_mmcsd_driver = { .pm = davinci_mmcsd_pm_ops, }, .remove = __exit_p(davinci_mmcsd_remove), + .id_table = davinci_mmc_devtype, }; static int __init davinci_mmcsd_init(void) diff --git a/include/linux/platform_data/mmc-davinci.h b/include/linux/platform_data/mmc-davinci.h index 5ba6b22ce338..9cea4ee377b5 100644 --- a/include/linux/platform_data/mmc-davinci.h +++ b/include/linux/platform_data/mmc-davinci.h @@ -23,9 +23,6 @@ struct davinci_mmc_config { /* any additional host capabilities: OR'd in to mmc->f_caps */ u32 caps; - /* Version of the MMC/SD controller */ - u8 version; - /* Number of sg segments */ u8 nr_sg; }; -- cgit From 88af8bbe4ef781031ad3370847553f3b42ba0076 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sun, 23 Dec 2012 21:10:24 +0100 Subject: usb: gadget: the start of the configfs interface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit |# modprobe dummy_hcd num=2 |# modprobe libcomposite |# lsmod |Module Size Used by |libcomposite 31648 0 |dummy_hcd 19871 0 |# mkdir /sys/kernel/config/usb_gadget/oha |# cd /sys/kernel/config/usb_gadget/oha |# mkdir configs/def.1 |# mkdir configs/def.2 |# mkdir functions/acm.ttyS1 |# mkdir strings/0x1 |mkdir: cannot create directory `strings/0x1': Invalid argument |# mkdir strings/0x409 |# mkdir strings/1033 |mkdir: cannot create directory `strings/1033': File exists |# mkdir strings/1032 |# mkdir configs/def.1/strings/0x409 |# mkdir configs/def.2/strings/0x409 |#find . -ls | 975 0 drwxr-xr-x 5 root root 0 Dec 23 17:40 . 
| 978 0 drwxr-xr-x 4 root root 0 Dec 23 17:43 ./strings | 4100 0 drwxr-xr-x 2 root root 0 Dec 23 17:43 ./strings/1032 | 995 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./strings/1032/serialnumber | 996 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./strings/1032/product | 997 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./strings/1032/manufacturer | 2002 0 drwxr-xr-x 2 root root 0 Dec 23 17:41 ./strings/0x409 | 998 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./strings/0x409/serialnumber | 999 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./strings/0x409/product | 1000 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./strings/0x409/manufacturer | 977 0 drwxr-xr-x 4 root root 0 Dec 23 17:41 ./configs | 4081 0 drwxr-xr-x 3 root root 0 Dec 23 17:41 ./configs/def.2 | 4082 0 drwxr-xr-x 3 root root 0 Dec 23 17:42 ./configs/def.2/strings | 2016 0 drwxr-xr-x 2 root root 0 Dec 23 17:42 ./configs/def.2/strings/0x409 | 1001 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./configs/def.2/strings/0x409/configuration | 1002 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./configs/def.2/bmAttributes | 1003 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./configs/def.2/MaxPower | 979 0 drwxr-xr-x 3 root root 0 Dec 23 17:42 ./configs/def.1 | 980 0 drwxr-xr-x 3 root root 0 Dec 23 17:42 ./configs/def.1/strings | 5122 0 drwxr-xr-x 2 root root 0 Dec 23 17:42 ./configs/def.1/strings/0x409 | 1004 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./configs/def.1/strings/0x409/configuration | 1005 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./configs/def.1/bmAttributes | 1006 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./configs/def.1/MaxPower | 976 0 drwxr-xr-x 3 root root 0 Dec 23 17:41 ./functions | 981 0 drwxr-xr-x 2 root root 0 Dec 23 17:41 ./functions/acm.ttyS1 | 1007 0 -r--r--r-- 1 root root 4096 Dec 23 17:43 ./functions/acm.ttyS1/port_num | 1008 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./UDC | 1009 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./bcdUSB | 1010 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./bcdDevice | 1011 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./idProduct | 1012 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./idVendor | 1013 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./bMaxPacketSize0 | 1014 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./bDeviceProtocol | 1015 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./bDeviceSubClass | 1016 0 -rw-r--r-- 1 root root 4096 Dec 23 17:43 ./bDeviceClass |# cat functions/acm.ttyS1/port_num |0 |# ls -lah /dev/ttyGS* |crw-rw---T 1 root dialout 252, 0 Dec 23 17:41 /dev/ttyGS0 | |# echo 0x1234 > idProduct |# echo 0xabcd > idVendor |# echo 1122 > strings/0x409/serialnumber |# echo "The manufacturer" > strings/0x409/manufacturer |# echo 1 > strings/1032/manufacturer |# echo 1sa > strings/1032/product |# echo tada > strings/1032/serialnumber |echo "Primary configuration" > configs/def.1/strings/0x409/configuration |# echo "Secondary configuration" > configs/def.2/strings/0x409/configuration |# ln -s functions/acm.ttyS1 configs/def.1/ |# ln -s functions/acm.ttyS1 configs/def.2/ |find configs/def.1/ -ls | 979 0 drwxr-xr-x 3 root root 0 Dec 23 17:49 configs/def.1/ | 6264 0 lrwxrwxrwx 1 root root 0 Dec 23 17:48 configs/def.1/acm.ttyS1 -> ../../../../usb_gadget/oha/functions/acm.ttyS1 | 980 0 drwxr-xr-x 3 root root 0 Dec 23 17:42 configs/def.1/strings | 5122 0 drwxr-xr-x 2 root root 0 Dec 23 17:49 configs/def.1/strings/0x409 | 6284 0 -rw-r--r-- 1 root root 4096 Dec 23 17:47 configs/def.1/strings/0x409/configuration | 6285 0 -rw-r--r-- 1 root root 4096 Dec 23 17:49 configs/def.1/bmAttributes | 6286 0 -rw-r--r-- 1 
root root 4096 Dec 23 17:49 configs/def.1/MaxPower | |echo 120 > configs/def.1/MaxPower | |# ls -lh /sys/class/udc/ |total 0 |lrwxrwxrwx 1 root root 0 Dec 23 17:50 dummy_udc.0 -> ../../devices/platform/dummy_udc.0/udc/dummy_udc.0 |lrwxrwxrwx 1 root root 0 Dec 23 17:50 dummy_udc.1 -> ../../devices/platform/dummy_udc.1/udc/dummy_udc.1 |# echo dummy_udc.0 > UDC |# lsusb |Bus 001 Device 002: ID abcd:1234 Unknown | |lsusb -d abcd:1234 -v |Device Descriptor: … | idVendor 0xabcd Unknown | idProduct 0x1234 | bcdDevice 3.06 | iManufacturer 1 The manufacturer | iProduct 2 | iSerial 3 1122 | bNumConfigurations 2 … |echo "" > UDC v5…v6 - wired up strings with usb_gstrings_attach() - add UDC attribute. Writing "udc-name" will bind the gadget. Writing an empty string (it should contain \n, since 0-byte writes get optimized away) will unbind the UDC from the gadget. The names of available UDCs can be obtained from /sys/class/udc/ v4…v5 - string rework. This will add a strings folder incl. language code like strings/409/manufacturer as suggested by Alan. - rebased on top of the reworked functions.c, which has usb_function_instance, which is available right after "mkdir acm.instance" and can be directly used for configuration via configfs. v3…v4 - moved functions from the root folder down to the gadget as suggested by Michał - configs now have their own configs folder as suggested by Michał. The folder is still name.bConfigurationValue where name becomes the sConfiguration. Is this useful, or should we just stick to configs/bConfigurationValue/? - added configfs support to the ACM function. The port_num attribute is exported by f_acm. An argument has been added to the USB alloc function to distinguish between "old" (use facm_configure() to configure) and the configfs interface (expose a config_node). The port_num is currently a dumb counter. It will require some function re-work to make it work. scheduled for v5: - symlinking a function into a config. v2…v3 - replaced one ifndef by ifdef as suggested by Michał - strstr()/strchr() in function_make as suggested by Michał - replace [iSerialNumber|iProduct|iManufacturer] with [sSerialNumber|sProduct|sManufacturer] as suggested by Alan - added creation of config descriptors v1…v2 - moved gadgets from configfs' root directory into /udcs/ within our "usb_gadget" folder. Requested by Andrzej & Michał - use a dot as a delimiter between function's name and its instance's name as suggested by Michał - renamed all config_item_type, configfs_group_operations, make_group, drop_item as suggested by Andrzej to remain consistent within this file and within other configfs users - Since configfs.c and functions.c are now part of the udc-core module, the module itself is now called udc. Also added a tiny ifdef around init code because udc-core is subsys init and this is too early for configfs in the built-in case. In the module case, we can only have one init function.
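[A side note on the strings/<langid> behaviour seen in the transcript above: the directory name is parsed as a 16-bit USB LANGID, whose low 10 bits hold the primary language and whose upper 6 bits hold the sublanguage. 0x409 and 1033 are the same number (US English: primary 0x009, sublanguage 1), which is why the second mkdir reports "File exists", and 0x1 is rejected because its sublanguage is 0. A standalone userspace sketch of the same check, mirroring check_user_usb_string() below, for illustration only:]

#include <stdio.h>

/* Low 10 bits: primary language; upper 6 bits: sublanguage. */
static int langid_valid(unsigned int num)
{
	unsigned int primary = num & 0x3ff;
	unsigned int sub = num >> 10;

	/* reserved or out-of-range primary language codes */
	if (primary == 0 || (primary >= 0x62 && primary <= 0xfe) ||
	    primary >= 0x100)
		return 0;
	return sub != 0;	/* a sublanguage must be present */
}

int main(void)
{
	printf("0x409 -> %d\n", langid_valid(0x409));	/* 1: US English */
	printf("1033  -> %d\n", langid_valid(1033));	/* 1: same value as 0x409 */
	printf("0x1   -> %d\n", langid_valid(0x1));	/* 0: sublanguage is 0 */
	return 0;
}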
Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Felipe Balbi --- drivers/usb/gadget/Kconfig | 1 + drivers/usb/gadget/Makefile | 2 +- drivers/usb/gadget/composite.c | 1 + drivers/usb/gadget/configfs.c | 1003 +++++++++++++++++++++++++++++++++++ drivers/usb/gadget/f_acm.c | 55 ++ include/linux/usb/composite.h | 3 + include/linux/usb/gadget_configfs.h | 110 ++++ 7 files changed, 1174 insertions(+), 1 deletion(-) create mode 100644 drivers/usb/gadget/configfs.c create mode 100644 include/linux/usb/gadget_configfs.h (limited to 'include/linux') diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index e1d3e0803cd5..74a29de8f254 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -493,6 +493,7 @@ endmenu # composite based drivers config USB_LIBCOMPOSITE tristate + select CONFIGFS_FS depends on USB_GADGET config USB_F_ACM diff --git a/drivers/usb/gadget/Makefile b/drivers/usb/gadget/Makefile index 82fb22511356..96e72433cd31 100644 --- a/drivers/usb/gadget/Makefile +++ b/drivers/usb/gadget/Makefile @@ -6,7 +6,7 @@ ccflags-$(CONFIG_USB_GADGET_DEBUG) := -DDEBUG obj-$(CONFIG_USB_GADGET) += udc-core.o obj-$(CONFIG_USB_LIBCOMPOSITE) += libcomposite.o libcomposite-y := usbstring.o config.o epautoconf.o -libcomposite-y += composite.o functions.o +libcomposite-y += composite.o functions.o configfs.o obj-$(CONFIG_USB_DUMMY_HCD) += dummy_hcd.o obj-$(CONFIG_USB_NET2272) += net2272.o obj-$(CONFIG_USB_NET2280) += net2280.o diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index c0d62b278610..55f4df60f327 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1637,6 +1637,7 @@ void composite_dev_cleanup(struct usb_composite_dev *cdev) kfree(cdev->req->buf); usb_ep_free_request(cdev->gadget->ep0, cdev->req); } + cdev->next_string_id = 0; device_remove_file(&cdev->gadget->dev, &dev_attr_suspended); } diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c new file mode 100644 index 000000000000..a34633a898a1 --- /dev/null +++ b/drivers/usb/gadget/configfs.c @@ -0,0 +1,1003 @@ +#include +#include +#include +#include +#include +#include + +int check_user_usb_string(const char *name, + struct usb_gadget_strings *stringtab_dev) +{ + unsigned primary_lang; + unsigned sub_lang; + u16 num; + int ret; + + ret = kstrtou16(name, 0, &num); + if (ret) + return ret; + + primary_lang = num & 0x3ff; + sub_lang = num >> 10; + + /* simple sanity check for valid langid */ + switch (primary_lang) { + case 0: + case 0x62 ... 0xfe: + case 0x100 ... 
0x3ff: + return -EINVAL; + } + if (!sub_lang) + return -EINVAL; + + stringtab_dev->language = num; + return 0; +} + +#define MAX_NAME_LEN 40 +#define MAX_USB_STRING_LANGS 2 + +struct gadget_info { + struct config_group group; + struct config_group functions_group; + struct config_group configs_group; + struct config_group strings_group; + struct config_group *default_groups[4]; + + struct mutex lock; + struct usb_gadget_strings *gstrings[MAX_USB_STRING_LANGS + 1]; + struct list_head string_list; + struct list_head available_func; + + const char *udc_name; +#ifdef CONFIG_USB_OTG + struct usb_otg_descriptor otg; +#endif + struct usb_composite_driver composite; + struct usb_composite_dev cdev; +}; + +struct config_usb_cfg { + struct config_group group; + struct config_group strings_group; + struct config_group *default_groups[2]; + struct list_head string_list; + struct usb_configuration c; + struct list_head func_list; + struct usb_gadget_strings *gstrings[MAX_USB_STRING_LANGS + 1]; +}; + +struct gadget_strings { + struct usb_gadget_strings stringtab_dev; + struct usb_string strings[USB_GADGET_FIRST_AVAIL_IDX]; + char *manufacturer; + char *product; + char *serialnumber; + + struct config_group group; + struct list_head list; +}; + +struct gadget_config_name { + struct usb_gadget_strings stringtab_dev; + struct usb_string strings; + char *configuration; + + struct config_group group; + struct list_head list; +}; + +static int usb_string_copy(const char *s, char **s_copy) +{ + int ret; + char *str; + char *copy = *s_copy; + ret = strlen(s); + if (ret > 126) + return -EOVERFLOW; + + str = kstrdup(s, GFP_KERNEL); + if (!str) + return -ENOMEM; + if (str[ret - 1] == '\n') + str[ret - 1] = '\0'; + kfree(copy); + *s_copy = str; + return 0; +} + +CONFIGFS_ATTR_STRUCT(gadget_info); +CONFIGFS_ATTR_STRUCT(config_usb_cfg); + +#define GI_DEVICE_DESC_ITEM_ATTR(name) \ + static struct gadget_info_attribute gadget_cdev_desc_##name = \ + __CONFIGFS_ATTR(name, S_IRUGO | S_IWUSR, \ + gadget_dev_desc_##name##_show, \ + gadget_dev_desc_##name##_store) + +#define GI_DEVICE_DESC_SIMPLE_R_u8(__name) \ + static ssize_t gadget_dev_desc_##__name##_show(struct gadget_info *gi, \ + char *page) \ +{ \ + return sprintf(page, "0x%02x\n", gi->cdev.desc.__name); \ +} + +#define GI_DEVICE_DESC_SIMPLE_R_u16(__name) \ + static ssize_t gadget_dev_desc_##__name##_show(struct gadget_info *gi, \ + char *page) \ +{ \ + return sprintf(page, "0x%04x\n", le16_to_cpup(&gi->cdev.desc.__name)); \ +} + + +#define GI_DEVICE_DESC_SIMPLE_W_u8(_name) \ + static ssize_t gadget_dev_desc_##_name##_store(struct gadget_info *gi, \ + const char *page, size_t len) \ +{ \ + u8 val; \ + int ret; \ + ret = kstrtou8(page, 0, &val); \ + if (ret) \ + return ret; \ + gi->cdev.desc._name = val; \ + return len; \ +} + +#define GI_DEVICE_DESC_SIMPLE_W_u16(_name) \ + static ssize_t gadget_dev_desc_##_name##_store(struct gadget_info *gi, \ + const char *page, size_t len) \ +{ \ + u16 val; \ + int ret; \ + ret = kstrtou16(page, 0, &val); \ + if (ret) \ + return ret; \ + gi->cdev.desc._name = cpu_to_le16p(&val); \ + return len; \ +} + +#define GI_DEVICE_DESC_SIMPLE_RW(_name, _type) \ + GI_DEVICE_DESC_SIMPLE_R_##_type(_name) \ + GI_DEVICE_DESC_SIMPLE_W_##_type(_name) + +GI_DEVICE_DESC_SIMPLE_R_u16(bcdUSB); +GI_DEVICE_DESC_SIMPLE_RW(bDeviceClass, u8); +GI_DEVICE_DESC_SIMPLE_RW(bDeviceSubClass, u8); +GI_DEVICE_DESC_SIMPLE_RW(bDeviceProtocol, u8); +GI_DEVICE_DESC_SIMPLE_RW(bMaxPacketSize0, u8); +GI_DEVICE_DESC_SIMPLE_RW(idVendor, u16); 
+GI_DEVICE_DESC_SIMPLE_RW(idProduct, u16); +GI_DEVICE_DESC_SIMPLE_R_u16(bcdDevice); + +static ssize_t is_valid_bcd(u16 bcd_val) +{ + if ((bcd_val & 0xf) > 9) + return -EINVAL; + if (((bcd_val >> 4) & 0xf) > 9) + return -EINVAL; + if (((bcd_val >> 8) & 0xf) > 9) + return -EINVAL; + if (((bcd_val >> 12) & 0xf) > 9) + return -EINVAL; + return 0; +} + +static ssize_t gadget_dev_desc_bcdDevice_store(struct gadget_info *gi, + const char *page, size_t len) +{ + u16 bcdDevice; + int ret; + + ret = kstrtou16(page, 0, &bcdDevice); + if (ret) + return ret; + ret = is_valid_bcd(bcdDevice); + if (ret) + return ret; + + gi->cdev.desc.bcdDevice = cpu_to_le16(bcdDevice); + return len; +} + +static ssize_t gadget_dev_desc_bcdUSB_store(struct gadget_info *gi, + const char *page, size_t len) +{ + u16 bcdUSB; + int ret; + + ret = kstrtou16(page, 0, &bcdUSB); + if (ret) + return ret; + ret = is_valid_bcd(bcdUSB); + if (ret) + return ret; + + gi->cdev.desc.bcdUSB = cpu_to_le16(bcdUSB); + return len; +} + +static ssize_t gadget_dev_desc_UDC_show(struct gadget_info *gi, char *page) +{ + return sprintf(page, "%s\n", gi->udc_name ?: ""); +} + +static int unregister_gadget(struct gadget_info *gi) +{ + int ret; + + if (!gi->udc_name) + return -ENODEV; + + ret = usb_gadget_unregister_driver(&gi->composite.gadget_driver); + if (ret) + return ret; + kfree(gi->udc_name); + gi->udc_name = NULL; + return 0; +} + +static ssize_t gadget_dev_desc_UDC_store(struct gadget_info *gi, + const char *page, size_t len) +{ + char *name; + int ret; + + name = kstrdup(page, GFP_KERNEL); + if (!name) + return -ENOMEM; + if (name[len - 1] == '\n') + name[len - 1] = '\0'; + + mutex_lock(&gi->lock); + + if (!strlen(name)) { + ret = unregister_gadget(gi); + if (ret) + goto err; + } else { + if (gi->udc_name) { + ret = -EBUSY; + goto err; + } + ret = udc_attach_driver(name, &gi->composite.gadget_driver); + if (ret) + goto err; + gi->udc_name = name; + } + mutex_unlock(&gi->lock); + return len; +err: + kfree(name); + mutex_unlock(&gi->lock); + return ret; +} + +GI_DEVICE_DESC_ITEM_ATTR(bDeviceClass); +GI_DEVICE_DESC_ITEM_ATTR(bDeviceSubClass); +GI_DEVICE_DESC_ITEM_ATTR(bDeviceProtocol); +GI_DEVICE_DESC_ITEM_ATTR(bMaxPacketSize0); +GI_DEVICE_DESC_ITEM_ATTR(idVendor); +GI_DEVICE_DESC_ITEM_ATTR(idProduct); +GI_DEVICE_DESC_ITEM_ATTR(bcdDevice); +GI_DEVICE_DESC_ITEM_ATTR(bcdUSB); +GI_DEVICE_DESC_ITEM_ATTR(UDC); + +static struct configfs_attribute *gadget_root_attrs[] = { + &gadget_cdev_desc_bDeviceClass.attr, + &gadget_cdev_desc_bDeviceSubClass.attr, + &gadget_cdev_desc_bDeviceProtocol.attr, + &gadget_cdev_desc_bMaxPacketSize0.attr, + &gadget_cdev_desc_idVendor.attr, + &gadget_cdev_desc_idProduct.attr, + &gadget_cdev_desc_bcdDevice.attr, + &gadget_cdev_desc_bcdUSB.attr, + &gadget_cdev_desc_UDC.attr, + NULL, +}; + +static inline struct gadget_info *to_gadget_info(struct config_item *item) +{ + return container_of(to_config_group(item), struct gadget_info, group); +} + +static inline struct gadget_strings *to_gadget_strings(struct config_item *item) +{ + return container_of(to_config_group(item), struct gadget_strings, + group); +} + +static inline struct gadget_config_name *to_gadget_config_name( + struct config_item *item) +{ + return container_of(to_config_group(item), struct gadget_config_name, + group); +} + +static inline struct config_usb_cfg *to_config_usb_cfg(struct config_item *item) +{ + return container_of(to_config_group(item), struct config_usb_cfg, + group); +} + +static inline struct usb_function_instance *to_usb_function_instance( + 
struct config_item *item) +{ + return container_of(to_config_group(item), + struct usb_function_instance, group); +} + +static void gadget_info_attr_release(struct config_item *item) +{ + struct gadget_info *gi = to_gadget_info(item); + + WARN_ON(!list_empty(&gi->cdev.configs)); + WARN_ON(!list_empty(&gi->string_list)); + WARN_ON(!list_empty(&gi->available_func)); + kfree(gi->composite.gadget_driver.function); + kfree(gi); +} + +CONFIGFS_ATTR_OPS(gadget_info); + +static struct configfs_item_operations gadget_root_item_ops = { + .release = gadget_info_attr_release, + .show_attribute = gadget_info_attr_show, + .store_attribute = gadget_info_attr_store, +}; + +static void gadget_config_attr_release(struct config_item *item) +{ + struct config_usb_cfg *cfg = to_config_usb_cfg(item); + + WARN_ON(!list_empty(&cfg->c.functions)); + list_del(&cfg->c.list); + kfree(cfg->c.label); + kfree(cfg); +} + +static int config_usb_cfg_link( + struct config_item *usb_cfg_ci, + struct config_item *usb_func_ci) +{ + struct config_usb_cfg *cfg = to_config_usb_cfg(usb_cfg_ci); + struct usb_composite_dev *cdev = cfg->c.cdev; + struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev); + + struct config_group *group = to_config_group(usb_func_ci); + struct usb_function_instance *fi = container_of(group, + struct usb_function_instance, group); + struct usb_function_instance *a_fi; + struct usb_function *f; + int ret; + + mutex_lock(&gi->lock); + /* + * Make sure this function is from within our _this_ gadget and not + * from another gadget or a random directory. + * Also a function instance can only be linked once. + */ + list_for_each_entry(a_fi, &gi->available_func, cfs_list) { + if (a_fi == fi) + break; + } + if (a_fi != fi) { + ret = -EINVAL; + goto out; + } + + list_for_each_entry(f, &cfg->func_list, list) { + if (f->fi == fi) { + ret = -EEXIST; + goto out; + } + } + + f = usb_get_function(fi); + if (IS_ERR(f)) { + ret = PTR_ERR(f); + goto out; + } + + /* stash the function until we bind it to the gadget */ + list_add_tail(&f->list, &cfg->func_list); + ret = 0; +out: + mutex_unlock(&gi->lock); + return ret; +} + +static int config_usb_cfg_unlink( + struct config_item *usb_cfg_ci, + struct config_item *usb_func_ci) +{ + struct config_usb_cfg *cfg = to_config_usb_cfg(usb_cfg_ci); + struct usb_composite_dev *cdev = cfg->c.cdev; + struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev); + + struct config_group *group = to_config_group(usb_func_ci); + struct usb_function_instance *fi = container_of(group, + struct usb_function_instance, group); + struct usb_function *f; + + /* + * ideally I would like to forbid to unlink functions while a gadget is + * bound to an UDC. Since this isn't possible at the moment, we simply + * force an unbind, the function is available here and then we can + * remove the function. 
+ */ + mutex_lock(&gi->lock); + if (gi->udc_name) + unregister_gadget(gi); + WARN_ON(gi->udc_name); + + list_for_each_entry(f, &cfg->func_list, list) { + if (f->fi == fi) { + list_del(&f->list); + usb_put_function(f); + mutex_unlock(&gi->lock); + return 0; + } + } + mutex_unlock(&gi->lock); + __WARN_printf("Unable to locate function to unbind\n"); + return 0; +} + +CONFIGFS_ATTR_OPS(config_usb_cfg); + +static struct configfs_item_operations gadget_config_item_ops = { + .release = gadget_config_attr_release, + .show_attribute = config_usb_cfg_attr_show, + .store_attribute = config_usb_cfg_attr_store, + .allow_link = config_usb_cfg_link, + .drop_link = config_usb_cfg_unlink, +}; + + +static ssize_t gadget_config_desc_MaxPower_show(struct config_usb_cfg *cfg, + char *page) +{ + return sprintf(page, "%u\n", cfg->c.MaxPower); +} + +static ssize_t gadget_config_desc_MaxPower_store(struct config_usb_cfg *cfg, + const char *page, size_t len) +{ + u16 val; + int ret; + ret = kstrtou16(page, 0, &val); + if (ret) + return ret; + if (DIV_ROUND_UP(val, 8) > 0xff) + return -ERANGE; + cfg->c.MaxPower = val; + return len; +} + +static ssize_t gadget_config_desc_bmAttributes_show(struct config_usb_cfg *cfg, + char *page) +{ + return sprintf(page, "0x%02x\n", cfg->c.bmAttributes); +} + +static ssize_t gadget_config_desc_bmAttributes_store(struct config_usb_cfg *cfg, + const char *page, size_t len) +{ + u8 val; + int ret; + ret = kstrtou8(page, 0, &val); + if (ret) + return ret; + if (!(val & USB_CONFIG_ATT_ONE)) + return -EINVAL; + if (val & ~(USB_CONFIG_ATT_ONE | USB_CONFIG_ATT_SELFPOWER | + USB_CONFIG_ATT_WAKEUP)) + return -EINVAL; + cfg->c.bmAttributes = val; + return len; +} + +#define CFG_CONFIG_DESC_ITEM_ATTR(name) \ + static struct config_usb_cfg_attribute gadget_usb_cfg_##name = \ + __CONFIGFS_ATTR(name, S_IRUGO | S_IWUSR, \ + gadget_config_desc_##name##_show, \ + gadget_config_desc_##name##_store) + +CFG_CONFIG_DESC_ITEM_ATTR(MaxPower); +CFG_CONFIG_DESC_ITEM_ATTR(bmAttributes); + +static struct configfs_attribute *gadget_config_attrs[] = { + &gadget_usb_cfg_MaxPower.attr, + &gadget_usb_cfg_bmAttributes.attr, + NULL, +}; + +static struct config_item_type gadget_config_type = { + .ct_item_ops = &gadget_config_item_ops, + .ct_attrs = gadget_config_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_item_type gadget_root_type = { + .ct_item_ops = &gadget_root_item_ops, + .ct_attrs = gadget_root_attrs, + .ct_owner = THIS_MODULE, +}; + +static void composite_init_dev(struct usb_composite_dev *cdev) +{ + spin_lock_init(&cdev->lock); + INIT_LIST_HEAD(&cdev->configs); + INIT_LIST_HEAD(&cdev->gstrings); +} + +static struct config_group *function_make( + struct config_group *group, + const char *name) +{ + struct gadget_info *gi; + struct usb_function_instance *fi; + char buf[MAX_NAME_LEN]; + char *func_name; + char *instance_name; + int ret; + + ret = snprintf(buf, MAX_NAME_LEN, "%s", name); + if (ret >= MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + func_name = buf; + instance_name = strchr(func_name, '.'); + if (!instance_name) { + pr_err("Unable to locate . 
in FUNC.INSTANCE\n"); + return ERR_PTR(-EINVAL); + } + *instance_name = '\0'; + instance_name++; + + fi = usb_get_function_instance(func_name); + if (IS_ERR(fi)) + return ERR_PTR(PTR_ERR(fi)); + + ret = config_item_set_name(&fi->group.cg_item, name); + if (ret) { + usb_put_function_instance(fi); + return ERR_PTR(ret); + } + + gi = container_of(group, struct gadget_info, functions_group); + + mutex_lock(&gi->lock); + list_add_tail(&fi->cfs_list, &gi->available_func); + mutex_unlock(&gi->lock); + return &fi->group; +} + +static void function_drop( + struct config_group *group, + struct config_item *item) +{ + struct usb_function_instance *fi = to_usb_function_instance(item); + struct gadget_info *gi; + + gi = container_of(group, struct gadget_info, functions_group); + + mutex_lock(&gi->lock); + list_del(&fi->cfs_list); + mutex_unlock(&gi->lock); + config_item_put(item); +} + +static struct configfs_group_operations functions_ops = { + .make_group = &function_make, + .drop_item = &function_drop, +}; + +static struct config_item_type functions_type = { + .ct_group_ops = &functions_ops, + .ct_owner = THIS_MODULE, +}; + +CONFIGFS_ATTR_STRUCT(gadget_config_name); +GS_STRINGS_RW(gadget_config_name, configuration); + +static struct configfs_attribute *gadget_config_name_langid_attrs[] = { + &gadget_config_name_configuration.attr, + NULL, +}; + +static void gadget_config_name_attr_release(struct config_item *item) +{ + struct gadget_config_name *cn = to_gadget_config_name(item); + + kfree(cn->configuration); + + list_del(&cn->list); + kfree(cn); +} + +USB_CONFIG_STRING_RW_OPS(gadget_config_name); +USB_CONFIG_STRINGS_LANG(gadget_config_name, config_usb_cfg); + +static struct config_group *config_desc_make( + struct config_group *group, + const char *name) +{ + struct gadget_info *gi; + struct config_usb_cfg *cfg; + char buf[MAX_NAME_LEN]; + char *num_str; + u8 num; + int ret; + + gi = container_of(group, struct gadget_info, configs_group); + ret = snprintf(buf, MAX_NAME_LEN, "%s", name); + if (ret >= MAX_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + num_str = strchr(buf, '.'); + if (!num_str) { + pr_err("Unable to locate . 
in name.bConfigurationValue\n"); + return ERR_PTR(-EINVAL); + } + + *num_str = '\0'; + num_str++; + + if (!strlen(buf)) + return ERR_PTR(-EINVAL); + + ret = kstrtou8(num_str, 0, &num); + if (ret) + return ERR_PTR(ret); + + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return ERR_PTR(-ENOMEM); + cfg->c.label = kstrdup(buf, GFP_KERNEL); + if (!cfg->c.label) { + ret = -ENOMEM; + goto err; + } + cfg->c.bConfigurationValue = num; + cfg->c.MaxPower = CONFIG_USB_GADGET_VBUS_DRAW; + cfg->c.bmAttributes = USB_CONFIG_ATT_ONE; + INIT_LIST_HEAD(&cfg->string_list); + INIT_LIST_HEAD(&cfg->func_list); + + cfg->group.default_groups = cfg->default_groups; + cfg->default_groups[0] = &cfg->strings_group; + + config_group_init_type_name(&cfg->group, name, + &gadget_config_type); + config_group_init_type_name(&cfg->strings_group, "strings", + &gadget_config_name_strings_type); + + ret = usb_add_config_only(&gi->cdev, &cfg->c); + if (ret) + goto err; + + return &cfg->group; +err: + kfree(cfg->c.label); + kfree(cfg); + return ERR_PTR(ret); +} + +static void config_desc_drop( + struct config_group *group, + struct config_item *item) +{ + config_item_put(item); +} + +static struct configfs_group_operations config_desc_ops = { + .make_group = &config_desc_make, + .drop_item = &config_desc_drop, +}; + +static struct config_item_type config_desc_type = { + .ct_group_ops = &config_desc_ops, + .ct_owner = THIS_MODULE, +}; + +CONFIGFS_ATTR_STRUCT(gadget_strings); +GS_STRINGS_RW(gadget_strings, manufacturer); +GS_STRINGS_RW(gadget_strings, product); +GS_STRINGS_RW(gadget_strings, serialnumber); + +static struct configfs_attribute *gadget_strings_langid_attrs[] = { + &gadget_strings_manufacturer.attr, + &gadget_strings_product.attr, + &gadget_strings_serialnumber.attr, + NULL, +}; + +static void gadget_strings_attr_release(struct config_item *item) +{ + struct gadget_strings *gs = to_gadget_strings(item); + + kfree(gs->manufacturer); + kfree(gs->product); + kfree(gs->serialnumber); + + list_del(&gs->list); + kfree(gs); +} + +USB_CONFIG_STRING_RW_OPS(gadget_strings); +USB_CONFIG_STRINGS_LANG(gadget_strings, gadget_info); + +static int configfs_do_nothing(struct usb_composite_dev *cdev) +{ + __WARN(); + return -EINVAL; +} + +int composite_dev_prepare(struct usb_composite_driver *composite, + struct usb_composite_dev *dev); + +static void purge_configs_funcs(struct gadget_info *gi) +{ + struct usb_configuration *c; + + list_for_each_entry(c, &gi->cdev.configs, list) { + struct usb_function *f, *tmp; + struct config_usb_cfg *cfg; + + cfg = container_of(c, struct config_usb_cfg, c); + + list_for_each_entry_safe(f, tmp, &c->functions, list) { + + list_move_tail(&f->list, &cfg->func_list); + if (f->unbind) { + dev_err(&gi->cdev.gadget->dev, "unbind function" + " '%s'/%p\n", f->name, f); + f->unbind(c, f); + } + } + c->next_interface_id = 0; + c->superspeed = 0; + c->highspeed = 0; + c->fullspeed = 0; + } +} + +static int configfs_composite_bind(struct usb_gadget *gadget, + struct usb_gadget_driver *gdriver) +{ + struct usb_composite_driver *composite = to_cdriver(gdriver); + struct gadget_info *gi = container_of(composite, + struct gadget_info, composite); + struct usb_composite_dev *cdev = &gi->cdev; + struct usb_configuration *c; + struct usb_string *s; + unsigned i; + int ret; + + /* the gi->lock is hold by the caller */ + cdev->gadget = gadget; + set_gadget_data(gadget, cdev); + ret = composite_dev_prepare(composite, cdev); + if (ret) + return ret; + /* and now the gadget bind */ + ret = -EINVAL; + + if 
(list_empty(&gi->cdev.configs)) { + pr_err("Need at least one configuration in %s.\n", + gi->composite.name); + goto err_comp_cleanup; + } + + + list_for_each_entry(c, &gi->cdev.configs, list) { + struct config_usb_cfg *cfg; + + cfg = container_of(c, struct config_usb_cfg, c); + if (list_empty(&cfg->func_list)) { + pr_err("Config %s/%d of %s needs at least one function.\n", + c->label, c->bConfigurationValue, + gi->composite.name); + goto err_comp_cleanup; + } + } + + /* init all strings */ + if (!list_empty(&gi->string_list)) { + struct gadget_strings *gs; + + i = 0; + list_for_each_entry(gs, &gi->string_list, list) { + + gi->gstrings[i] = &gs->stringtab_dev; + gs->stringtab_dev.strings = gs->strings; + gs->strings[USB_GADGET_MANUFACTURER_IDX].s = + gs->manufacturer; + gs->strings[USB_GADGET_PRODUCT_IDX].s = gs->product; + gs->strings[USB_GADGET_SERIAL_IDX].s = gs->serialnumber; + i++; + } + gi->gstrings[i] = NULL; + s = usb_gstrings_attach(&gi->cdev, gi->gstrings, + USB_GADGET_FIRST_AVAIL_IDX); + if (IS_ERR(s)) + goto err_comp_cleanup; + + gi->cdev.desc.iManufacturer = s[USB_GADGET_MANUFACTURER_IDX].id; + gi->cdev.desc.iProduct = s[USB_GADGET_PRODUCT_IDX].id; + gi->cdev.desc.iSerialNumber = s[USB_GADGET_SERIAL_IDX].id; + } + + /* Go through all configs, attach all functions */ + list_for_each_entry(c, &gi->cdev.configs, list) { + struct config_usb_cfg *cfg; + struct usb_function *f; + struct usb_function *tmp; + struct gadget_config_name *cn; + + cfg = container_of(c, struct config_usb_cfg, c); + if (!list_empty(&cfg->string_list)) { + i = 0; + list_for_each_entry(cn, &cfg->string_list, list) { + cfg->gstrings[i] = &cn->stringtab_dev; + cn->stringtab_dev.strings = &cn->strings; + cn->strings.s = cn->configuration; + i++; + } + cfg->gstrings[i] = NULL; + s = usb_gstrings_attach(&gi->cdev, cfg->gstrings, 1); + if (IS_ERR(s)) + goto err_comp_cleanup; + c->iConfiguration = s[0].id; + } + + list_for_each_entry_safe(f, tmp, &cfg->func_list, list) { + list_del(&f->list); + ret = usb_add_function(c, f); + if (ret) + goto err_purge_funcs; + } + usb_ep_autoconfig_reset(cdev->gadget); + } + usb_ep_autoconfig_reset(cdev->gadget); + return 0; + +err_purge_funcs: + purge_configs_funcs(gi); +err_comp_cleanup: + composite_dev_cleanup(cdev); + return ret; +} + +static void configfs_composite_unbind(struct usb_gadget *gadget) +{ + struct usb_composite_dev *cdev; + struct gadget_info *gi; + + /* the gi->lock is held by the caller */ + + cdev = get_gadget_data(gadget); + gi = container_of(cdev, struct gadget_info, cdev); + + purge_configs_funcs(gi); + composite_dev_cleanup(cdev); + usb_ep_autoconfig_reset(cdev->gadget); + cdev->gadget = NULL; + set_gadget_data(gadget, NULL); +} + +static const struct usb_gadget_driver configfs_driver_template = { + .bind = configfs_composite_bind, + .unbind = configfs_composite_unbind, + + .setup = composite_setup, + .disconnect = composite_disconnect, + + .max_speed = USB_SPEED_SUPER, + .driver = { + .owner = THIS_MODULE, + .name = "configfs-gadget", + }, +}; + +static struct config_group *gadgets_make( + struct config_group *group, + const char *name) +{ + struct gadget_info *gi; + + gi = kzalloc(sizeof(*gi), GFP_KERNEL); + if (!gi) + return ERR_PTR(-ENOMEM); + + gi->group.default_groups = gi->default_groups; + gi->group.default_groups[0] = &gi->functions_group; + gi->group.default_groups[1] = &gi->configs_group; + gi->group.default_groups[2] = &gi->strings_group; + + config_group_init_type_name(&gi->functions_group, "functions", + &functions_type); +
config_group_init_type_name(&gi->configs_group, "configs", + &config_desc_type); + config_group_init_type_name(&gi->strings_group, "strings", + &gadget_strings_strings_type); + + gi->composite.bind = configfs_do_nothing; + gi->composite.unbind = configfs_do_nothing; + gi->composite.suspend = NULL; + gi->composite.resume = NULL; + gi->composite.max_speed = USB_SPEED_SUPER; + + mutex_init(&gi->lock); + INIT_LIST_HEAD(&gi->string_list); + INIT_LIST_HEAD(&gi->available_func); + + composite_init_dev(&gi->cdev); + gi->cdev.desc.bLength = USB_DT_DEVICE_SIZE; + gi->cdev.desc.bDescriptorType = USB_DT_DEVICE; + gi->cdev.desc.bcdDevice = cpu_to_le16(get_default_bcdDevice()); + + gi->composite.gadget_driver = configfs_driver_template; + + gi->composite.gadget_driver.function = kstrdup(name, GFP_KERNEL); + gi->composite.name = gi->composite.gadget_driver.function; + + if (!gi->composite.gadget_driver.function) + goto err; + +#ifdef CONFIG_USB_OTG + gi->otg.bLength = sizeof(struct usb_otg_descriptor); + gi->otg.bDescriptorType = USB_DT_OTG; + gi->otg.bmAttributes = USB_OTG_SRP | USB_OTG_HNP; +#endif + + config_group_init_type_name(&gi->group, name, + &gadget_root_type); + return &gi->group; +err: + kfree(gi); + return ERR_PTR(-ENOMEM); +} + +static void gadgets_drop(struct config_group *group, struct config_item *item) +{ + config_item_put(item); +} + +static struct configfs_group_operations gadgets_ops = { + .make_group = &gadgets_make, + .drop_item = &gadgets_drop, +}; + +static struct config_item_type gadgets_type = { + .ct_group_ops = &gadgets_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem gadget_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "usb_gadget", + .ci_type = &gadgets_type, + }, + }, + .su_mutex = __MUTEX_INITIALIZER(gadget_subsys.su_mutex), +}; + +static int __init gadget_cfs_init(void) +{ + int ret; + + config_group_init(&gadget_subsys.su_group); + + ret = configfs_register_subsystem(&gadget_subsys); + return ret; +} +module_init(gadget_cfs_init); + +static void __exit gadget_cfs_exit(void) +{ + configfs_unregister_subsystem(&gadget_subsys); +} +module_exit(gadget_cfs_exit); diff --git a/drivers/usb/gadget/f_acm.c b/drivers/usb/gadget/f_acm.c index ba7daaaad148..4b7e33e5d9c6 100644 --- a/drivers/usb/gadget/f_acm.c +++ b/drivers/usb/gadget/f_acm.c @@ -763,6 +763,59 @@ static struct usb_function *acm_alloc_func(struct usb_function_instance *fi) return &acm->port.func; } +static inline struct f_serial_opts *to_f_serial_opts(struct config_item *item) +{ + return container_of(to_config_group(item), struct f_serial_opts, + func_inst.group); +} + +CONFIGFS_ATTR_STRUCT(f_serial_opts); +static ssize_t f_acm_attr_show(struct config_item *item, + struct configfs_attribute *attr, + char *page) +{ + struct f_serial_opts *opts = to_f_serial_opts(item); + struct f_serial_opts_attribute *f_serial_opts_attr = + container_of(attr, struct f_serial_opts_attribute, attr); + ssize_t ret = 0; + + if (f_serial_opts_attr->show) + ret = f_serial_opts_attr->show(opts, page); + return ret; +} + +static void acm_attr_release(struct config_item *item) +{ + struct f_serial_opts *opts = to_f_serial_opts(item); + + usb_put_function_instance(&opts->func_inst); +} + +static struct configfs_item_operations acm_item_ops = { + .release = acm_attr_release, + .show_attribute = f_acm_attr_show, +}; + +static ssize_t f_acm_port_num_show(struct f_serial_opts *opts, char *page) +{ + return sprintf(page, "%u\n", opts->port_num); +} + +static struct f_serial_opts_attribute f_acm_port_num = + 
__CONFIGFS_ATTR_RO(port_num, f_acm_port_num_show); + + +static struct configfs_attribute *acm_attrs[] = { + &f_acm_port_num.attr, + NULL, +}; + +static struct config_item_type acm_func_type = { + .ct_item_ops = &acm_item_ops, + .ct_attrs = acm_attrs, + .ct_owner = THIS_MODULE, +}; + static void acm_free_instance(struct usb_function_instance *fi) { struct f_serial_opts *opts; @@ -786,6 +839,8 @@ static struct usb_function_instance *acm_alloc_instance(void) kfree(opts); return ERR_PTR(ret); } + config_group_init_type_name(&opts->func_inst.group, "", + &acm_func_type); return &opts->func_inst; } DECLARE_USB_FUNCTION_INIT(acm, acm_alloc_instance, acm_alloc_func); diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 8860594d6364..5e61589fc166 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -39,6 +39,7 @@ #include #include #include +#include /* * USB function drivers should return USB_GADGET_DELAYED_STATUS if they @@ -464,6 +465,8 @@ struct usb_function_driver { }; struct usb_function_instance { + struct config_group group; + struct list_head cfs_list; struct usb_function_driver *fd; void (*free_func_inst)(struct usb_function_instance *inst); }; diff --git a/include/linux/usb/gadget_configfs.h b/include/linux/usb/gadget_configfs.h new file mode 100644 index 000000000000..d74c0ae989d5 --- /dev/null +++ b/include/linux/usb/gadget_configfs.h @@ -0,0 +1,110 @@ +#ifndef __GADGET_CONFIGFS__ +#define __GADGET_CONFIGFS__ + +#include + +int check_user_usb_string(const char *name, + struct usb_gadget_strings *stringtab_dev); + +#define GS_STRINGS_W(__struct, __name) \ + static ssize_t __struct##_##__name##_store(struct __struct *gs, \ + const char *page, size_t len) \ +{ \ + int ret; \ + \ + ret = usb_string_copy(page, &gs->__name); \ + if (ret) \ + return ret; \ + return len; \ +} + +#define GS_STRINGS_R(__struct, __name) \ + static ssize_t __struct##_##__name##_show(struct __struct *gs, \ + char *page) \ +{ \ + return sprintf(page, "%s\n", gs->__name ?: ""); \ +} + +#define GS_STRING_ITEM_ATTR(struct_name, name) \ + static struct struct_name##_attribute struct_name##_##name = \ + __CONFIGFS_ATTR(name, S_IRUGO | S_IWUSR, \ + struct_name##_##name##_show, \ + struct_name##_##name##_store) + +#define GS_STRINGS_RW(struct_name, _name) \ + GS_STRINGS_R(struct_name, _name) \ + GS_STRINGS_W(struct_name, _name) \ + GS_STRING_ITEM_ATTR(struct_name, _name) + +#define USB_CONFIG_STRING_RW_OPS(struct_in) \ + CONFIGFS_ATTR_OPS(struct_in); \ + \ +static struct configfs_item_operations struct_in##_langid_item_ops = { \ + .release = struct_in##_attr_release, \ + .show_attribute = struct_in##_attr_show, \ + .store_attribute = struct_in##_attr_store, \ +}; \ + \ +static struct config_item_type struct_in##_langid_type = { \ + .ct_item_ops = &struct_in##_langid_item_ops, \ + .ct_attrs = struct_in##_langid_attrs, \ + .ct_owner = THIS_MODULE, \ +} + +#define USB_CONFIG_STRINGS_LANG(struct_in, struct_member) \ + static struct config_group *struct_in##_strings_make( \ + struct config_group *group, \ + const char *name) \ + { \ + struct struct_member *gi; \ + struct struct_in *gs; \ + struct struct_in *new; \ + int langs = 0; \ + int ret; \ + \ + new = kzalloc(sizeof(*new), GFP_KERNEL); \ + if (!new) \ + return ERR_PTR(-ENOMEM); \ + \ + ret = check_user_usb_string(name, &new->stringtab_dev); \ + if (ret) \ + goto err; \ + config_group_init_type_name(&new->group, name, \ + &struct_in##_langid_type); \ + \ + gi = container_of(group, struct struct_member, strings_group); 
\ + ret = -EEXIST; \ + list_for_each_entry(gs, &gi->string_list, list) { \ + if (gs->stringtab_dev.language == new->stringtab_dev.language) \ + goto err; \ + langs++; \ + } \ + ret = -EOVERFLOW; \ + if (langs >= MAX_USB_STRING_LANGS) \ + goto err; \ + \ + list_add_tail(&new->list, &gi->string_list); \ + return &new->group; \ +err: \ + kfree(new); \ + return ERR_PTR(ret); \ +} \ + \ +static void struct_in##_strings_drop( \ + struct config_group *group, \ + struct config_item *item) \ +{ \ + config_item_put(item); \ +} \ + \ +static struct configfs_group_operations struct_in##_strings_ops = { \ + .make_group = &struct_in##_strings_make, \ + .drop_item = &struct_in##_strings_drop, \ +}; \ + \ +static struct config_item_type struct_in##_strings_type = { \ + .ct_group_ops = &struct_in##_strings_ops, \ + .ct_owner = THIS_MODULE, \ +} + +#endif -- cgit From 3451d0243c3cdfd729b36f9684a14659d4895ca3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Aug 2011 23:21:01 +0200 Subject: nohz: Rename CONFIG_NO_HZ to CONFIG_NO_HZ_COMMON We are planning to convert the dynticks Kconfig options layout into a choice menu. The user must be able to easily pick any of the following implementations: constant periodic tick, idle dynticks, full dynticks. As this implies a mutual exclusion, the two dynticks implementations need to converge on the selection of a common Kconfig option in order to ease the sharing of a common infrastructure. It would thus seem pretty natural to reuse CONFIG_NO_HZ to that end. It already implements all the idle dynticks code and the full dynticks depends on all that code for now. So ideally the choice menu would propose CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED, then both would select CONFIG_NO_HZ. On the other hand we want to stay backward compatible: if CONFIG_NO_HZ is set in an older config file, we want to enable CONFIG_NO_HZ_IDLE by default. But we can't afford both at the same time or we run into a circular dependency: 1) CONFIG_NO_HZ_IDLE and CONFIG_NO_HZ_EXTENDED both select CONFIG_NO_HZ 2) If CONFIG_NO_HZ is set, we default to CONFIG_NO_HZ_IDLE We might be able to support that from Kconfig/Kbuild but it may not be wise to introduce such a confusing behaviour. So to solve this, create a new CONFIG_NO_HZ_COMMON option which gathers the common code between idle and full dynticks (that common code for now is simply the idle dynticks code) and select it from their referring Kconfig. Then we'll later create CONFIG_NO_HZ_IDLE and map CONFIG_NO_HZ to it for backward compatibility. Signed-off-by: Frederic Weisbecker Cc: Andrew Morton Cc: Chris Metcalf Cc: Christoph Lameter Cc: Geoff Levand Cc: Gilad Ben Yossef Cc: Hakan Akkan Cc: Ingo Molnar Cc: Kevin Hilman Cc: Li Zhong Cc: Namhyung Kim Cc: Paul E.
McKenney Cc: Paul Gortmaker Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner --- Documentation/RCU/stallwarn.txt | 2 +- Documentation/cpu-freq/governors.txt | 4 ++-- arch/um/include/shared/common-offsets.h | 4 ++-- arch/um/os-Linux/time.c | 2 +- include/linux/sched.h | 8 ++++---- include/linux/tick.h | 8 ++++---- init/Kconfig | 2 +- kernel/hrtimer.c | 4 ++-- kernel/sched/core.c | 18 +++++++++--------- kernel/sched/fair.c | 10 +++++----- kernel/sched/sched.h | 4 ++-- kernel/softirq.c | 2 +- kernel/time/Kconfig | 13 +++++++++---- kernel/time/tick-sched.c | 12 ++++++------ kernel/timer.c | 4 ++-- 15 files changed, 51 insertions(+), 46 deletions(-) (limited to 'include/linux') diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index 1927151b386b..b336755b71ed 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -176,7 +176,7 @@ o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that o A hardware or software issue shuts off the scheduler-clock interrupt on a CPU that is not in dyntick-idle mode. This problem really has happened, and seems to be most likely to - result in RCU CPU stall warnings for CONFIG_NO_HZ=n kernels. + result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. o A bug in the RCU implementation. diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt index c7a2eb8450c2..e3e5d9ae50cd 100644 --- a/Documentation/cpu-freq/governors.txt +++ b/Documentation/cpu-freq/governors.txt @@ -131,8 +131,8 @@ sampling_rate_min: The sampling rate is limited by the HW transition latency: transition_latency * 100 Or by kernel restrictions: -If CONFIG_NO_HZ is set, the limit is 10ms fixed. -If CONFIG_NO_HZ is not set or nohz=off boot parameter is used, the +If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed. 
+If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is used, the limits depend on the CONFIG_HZ option: HZ=1000: min=20000us (20ms) HZ=250: min=80000us (80ms) diff --git a/arch/um/include/shared/common-offsets.h b/arch/um/include/shared/common-offsets.h index 2df313b6a586..c92306809029 100644 --- a/arch/um/include/shared/common-offsets.h +++ b/arch/um/include/shared/common-offsets.h @@ -30,8 +30,8 @@ DEFINE(UM_NSEC_PER_USEC, NSEC_PER_USEC); #ifdef CONFIG_PRINTK DEFINE(UML_CONFIG_PRINTK, CONFIG_PRINTK); #endif -#ifdef CONFIG_NO_HZ -DEFINE(UML_CONFIG_NO_HZ, CONFIG_NO_HZ); +#ifdef CONFIG_NO_HZ_COMMON +DEFINE(UML_CONFIG_NO_HZ_COMMON, CONFIG_NO_HZ_COMMON); #endif #ifdef CONFIG_UML_X86 DEFINE(UML_CONFIG_UML_X86, CONFIG_UML_X86); diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c index fac388cb464f..e9824d5dd7d5 100644 --- a/arch/um/os-Linux/time.c +++ b/arch/um/os-Linux/time.c @@ -79,7 +79,7 @@ long long os_nsecs(void) return timeval_to_ns(&tv); } -#ifdef UML_CONFIG_NO_HZ +#ifdef UML_CONFIG_NO_HZ_COMMON static int after_sleep_interval(struct timespec *ts) { return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 10626e2ee688..1ff9e0a5de27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -230,7 +230,7 @@ extern void init_idle_bootup_task(struct task_struct *idle); extern int runqueue_is_locked(int cpu); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); extern int get_nohz_timer_target(void); @@ -1758,13 +1758,13 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p, } #endif -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON void calc_load_enter_idle(void); void calc_load_exit_idle(void); #else static inline void calc_load_enter_idle(void) { } static inline void calc_load_exit_idle(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ #ifndef CONFIG_CPUMASK_OFFSTACK static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) @@ -1850,7 +1850,7 @@ extern void idle_task_exit(void); static inline void idle_task_exit(void) {} #endif -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) extern void wake_up_nohz_cpu(int cpu); #else static inline void wake_up_nohz_cpu(int cpu) { } diff --git a/include/linux/tick.h b/include/linux/tick.h index 44bfa8aa439f..5e403339ee14 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -82,7 +82,7 @@ extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); # endif -# if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +# if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS extern void tick_cancel_sched_timer(int cpu); # else static inline void tick_cancel_sched_timer(int cpu) { } @@ -123,7 +123,7 @@ static inline void tick_check_idle(int cpu) { } static inline int tick_oneshot_mode_active(void) { return 0; } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ -# ifdef CONFIG_NO_HZ +# ifdef CONFIG_NO_HZ_COMMON DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); static inline int tick_nohz_tick_stopped(void) @@ -138,7 +138,7 @@ extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); -# else /* !CONFIG_NO_HZ */ +# else /* !CONFIG_NO_HZ_COMMON */ static inline int tick_nohz_tick_stopped(void) { 
return 0; @@ -155,7 +155,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } -# endif /* !NO_HZ */ +# endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_EXTENDED extern int tick_nohz_extended_cpu(int cpu); diff --git a/init/Kconfig b/init/Kconfig index 8a1dac2f80a9..edc8132584f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -580,7 +580,7 @@ config RCU_FANOUT_EXACT config RCU_FAST_NO_HZ bool "Accelerate last non-dyntick-idle CPU's grace periods" - depends on NO_HZ && SMP + depends on NO_HZ_COMMON && SMP default n help This option causes RCU to attempt to accelerate grace periods in diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..ec60482d8b03 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -160,7 +160,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, */ static int hrtimer_get_target(int this_cpu, int pinned) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) return get_nohz_timer_target(); #endif @@ -1106,7 +1106,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer) } EXPORT_SYMBOL_GPL(hrtimer_get_remaining); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e91ee589f793..9bb397da63d6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -549,7 +549,7 @@ void resched_cpu(int cpu) raw_spin_unlock_irqrestore(&rq->lock, flags); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * In the semi idle case, use the nearest busy cpu for migrating timers * from an idle cpu. This is good for power-savings. @@ -641,14 +641,14 @@ static inline bool got_nohz_idle_kick(void) return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); } -#else /* CONFIG_NO_HZ */ +#else /* CONFIG_NO_HZ_COMMON */ static inline bool got_nohz_idle_kick(void) { return false; } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ void sched_avg_update(struct rq *rq) { @@ -2139,7 +2139,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) return load >> FSHIFT; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Handle NO_HZ for the global load-average. 
* @@ -2365,12 +2365,12 @@ static void calc_global_nohz(void) smp_wmb(); calc_load_idx++; } -#else /* !CONFIG_NO_HZ */ +#else /* !CONFIG_NO_HZ_COMMON */ static inline long calc_load_fold_idle(void) { return 0; } static inline void calc_global_nohz(void) { } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * calc_load - update the avenrun load estimates 10 ticks after the @@ -2530,7 +2530,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -2590,7 +2590,7 @@ void update_cpu_load_nohz(void) } raw_spin_unlock(&this_rq->lock); } -#endif /* CONFIG_NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from scheduler_tick() @@ -7023,7 +7023,7 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->cfs_tasks); rq_attach_root(rq, &def_root_domain); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON rq->nohz_flags = 0; #endif #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 539760ef00c4..5c97fca091a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5331,7 +5331,7 @@ out_unlock: return 0; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * idle load balancing details * - When one of the busy CPUs notice that there may be an idle rebalancing @@ -5541,9 +5541,9 @@ out: rq->next_balance = next_balance; } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* - * In CONFIG_NO_HZ case, the idle balance kickee will do the + * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the * rebalancing for all the cpus for whom scheduler ticks are stopped. */ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) @@ -5686,7 +5686,7 @@ void trigger_load_balance(struct rq *rq, int cpu) if (time_after_eq(jiffies, rq->next_balance) && likely(!on_null_domain(cpu))) raise_softirq(SCHED_SOFTIRQ); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) nohz_balancer_kick(cpu); #endif @@ -6156,7 +6156,7 @@ __init void init_sched_fair_class(void) #ifdef CONFIG_SMP open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); cpu_notifier(sched_ilb_notifier, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bd15a43eebc..889904dd6d77 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -404,7 +404,7 @@ struct rq { #define CPU_LOAD_IDX_MAX 5 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; unsigned long last_load_update_tick; -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON u64 nohz_stamp; unsigned long nohz_flags; #endif @@ -1333,7 +1333,7 @@ extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); extern void account_cfs_bandwidth_used(int enabled, int was_enabled); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, diff --git a/kernel/softirq.c b/kernel/softirq.c index b4d252fd195b..de15813f2a66 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -348,7 +348,7 @@ void irq_exit(void) if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* Make sure that timer wheel updates are propagated */ if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 
tick_nohz_irq_exit(); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 726c33e00da2..c88fc43494c9 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -64,16 +64,21 @@ config GENERIC_CMOS_UPDATE if GENERIC_CLOCKEVENTS menu "Timers subsystem" -# Core internal switch. Selected by NO_HZ / HIGH_RES_TIMERS. This is +# Core internal switch. Selected by NO_HZ_COMMON / HIGH_RES_TIMERS. This is # only related to the tick functionality. Oneshot clockevent devices # are supported independ of this. config TICK_ONESHOT bool +config NO_HZ_COMMON + bool + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + config NO_HZ bool "Tickless System (Dynamic Ticks)" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS - select TICK_ONESHOT + select NO_HZ_COMMON help This option enables a tickless system: timer interrupts will only trigger on an as-needed basis both when the system is @@ -81,14 +86,14 @@ config NO_HZ config NO_HZ_EXTENDED bool "Full dynticks system" - # NO_HZ dependency + # NO_HZ_COMMON dependency depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # RCU_USER_QS depends on HAVE_CONTEXT_TRACKING && SMP # RCU_NOCB_CPU dependency depends on TREE_RCU || TREE_PREEMPT_RCU depends on VIRT_CPU_ACCOUNTING_GEN - select NO_HZ + select NO_HZ_COMMON select RCU_USER_QS select RCU_NOCB_CPU select CONTEXT_TRACKING_FORCE diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 57bb3fe5aaa3..ccfc2086cd4b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -104,7 +104,7 @@ static void tick_sched_do_timer(ktime_t now) { int cpu = smp_processor_id(); -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Check if the do_timer duty was dropped. We don't care about * concurrency: This happens only when the cpu in charge went @@ -124,7 +124,7 @@ static void tick_sched_do_timer(ktime_t now) static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) { -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * When we are idle and the tick is stopped, we have to touch * the watchdog as we might not schedule for a really long @@ -235,7 +235,7 @@ core_initcall(init_tick_nohz_extended); /* * NOHZ - aka dynamic tick functionality */ -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * NO HZ enabled ? 
*/ @@ -907,7 +907,7 @@ static inline void tick_check_nohz(int cpu) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_check_nohz(int cpu) { } -#endif /* NO_HZ */ +#endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() @@ -992,14 +992,14 @@ void tick_setup_sched_timer(void) now = ktime_get(); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON if (tick_nohz_enabled) ts->nohz_mode = NOHZ_MODE_HIGHRES; #endif } #endif /* HIGH_RES_TIMERS */ -#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS void tick_cancel_sched_timer(int cpu) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); diff --git a/kernel/timer.c b/kernel/timer.c index 4e3040b40d16..1b7489fdea41 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -738,7 +738,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) +#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP) if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) cpu = get_nohz_timer_target(); #endif @@ -1188,7 +1188,7 @@ static inline void __run_timers(struct tvec_base *base) spin_unlock_irq(&base->lock); } -#ifdef CONFIG_NO_HZ +#ifdef CONFIG_NO_HZ_COMMON /* * Find out when the next timer event is due to happen. This * is used on S/390 to stop all activity when a CPU is idle. -- cgit From 684d5ce4afafddb2ad08f36ea30ca7d7adc88ebe Mon Sep 17 00:00:00 2001 From: Zhenhua HUANG Date: Tue, 2 Apr 2013 13:24:15 +0100 Subject: regulator: ab8500: Introduce aux5, aux6 regulators for AB8540 Introduce aux5, aux6 into ab8540 regulator framework. Signed-off-by: Zhenhua HUANG Signed-off-by: Lee Jones Reviewed-by: Maxime COQUELIN Reviewed-by: David PARIS Reviewed-by: Philippe LANGLAIS Signed-off-by: Mark Brown --- drivers/regulator/ab8500.c | 65 ++++++++++++++++++++++++++++++++++++++++ include/linux/regulator/ab8500.h | 2 ++ 2 files changed, 67 insertions(+) (limited to 'include/linux') diff --git a/drivers/regulator/ab8500.c b/drivers/regulator/ab8500.c index 9cb634807ff8..433cac4396df 100644 --- a/drivers/regulator/ab8500.c +++ b/drivers/regulator/ab8500.c @@ -149,6 +149,21 @@ static const unsigned int ldo_vaux3_ab8540_voltages[] = { 3050000, }; +static const unsigned int ldo_vaux56_ab8540_voltages[] = { + 750000, 760000, 770000, 780000, 790000, 800000, + 810000, 820000, 830000, 840000, 850000, 860000, + 870000, 880000, 890000, 900000, 910000, 920000, + 930000, 940000, 950000, 960000, 970000, 980000, + 990000, 1000000, 1010000, 1020000, 1030000, + 1040000, 1050000, 1060000, 1070000, 1080000, + 1090000, 1100000, 1110000, 1120000, 1130000, + 1140000, 1150000, 1160000, 1170000, 1180000, + 1190000, 1200000, 1210000, 1220000, 1230000, + 1240000, 1250000, 1260000, 1270000, 1280000, + 1290000, 1300000, 1310000, 1320000, 1330000, + 1340000, 1350000, 1360000, 1800000, 2790000, +}; + static const unsigned int ldo_vintcore_voltages[] = { 1200000, 1225000, @@ -1569,6 +1584,54 @@ static struct ab8500_regulator_info .voltage_reg = 0x2f, .voltage_mask = 0x0f, }, + [AB8540_LDO_AUX5] = { + .desc = { + .name = "LDO-AUX5", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8540_LDO_AUX5, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), + }, + .load_lp_uA = 20000, + /* values for Vaux5Regu register */ + .update_bank = 0x04, + .update_reg = 0x32, + .update_mask = 0x03, + .update_val = 0x01, + 
.update_val_idle = 0x03, + .update_val_normal = 0x01, + /* values for Vaux5SEL register */ + .voltage_bank = 0x04, + .voltage_reg = 0x33, + .voltage_mask = 0x3f, + .voltages = ldo_vaux56_ab8540_voltages, + .voltages_len = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), + }, + [AB8540_LDO_AUX6] = { + .desc = { + .name = "LDO-AUX6", + .ops = &ab8500_regulator_volt_mode_ops, + .type = REGULATOR_VOLTAGE, + .id = AB8540_LDO_AUX6, + .owner = THIS_MODULE, + .n_voltages = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), + }, + .load_lp_uA = 20000, + /* values for Vaux6Regu register */ + .update_bank = 0x04, + .update_reg = 0x35, + .update_mask = 0x03, + .update_val = 0x01, + .update_val_idle = 0x03, + .update_val_normal = 0x01, + /* values for Vaux6SEL register */ + .voltage_bank = 0x04, + .voltage_reg = 0x36, + .voltage_mask = 0x3f, + .voltages = ldo_vaux56_ab8540_voltages, + .voltages_len = ARRAY_SIZE(ldo_vaux56_ab8540_voltages), + }, [AB8540_LDO_INTCORE] = { .desc = { .name = "LDO-INTCORE", @@ -2979,6 +3042,8 @@ static struct of_regulator_match ab8540_regulator_match[] = { { .name = "ab8500_ldo_aux2", .driver_data = (void *) AB8540_LDO_AUX2, }, { .name = "ab8500_ldo_aux3", .driver_data = (void *) AB8540_LDO_AUX3, }, { .name = "ab8500_ldo_aux4", .driver_data = (void *) AB8540_LDO_AUX4, }, + { .name = "ab8500_ldo_aux5", .driver_data = (void *) AB8540_LDO_AUX5, }, + { .name = "ab8500_ldo_aux6", .driver_data = (void *) AB8540_LDO_AUX6, }, { .name = "ab8500_ldo_intcore", .driver_data = (void *) AB8540_LDO_INTCORE, }, { .name = "ab8500_ldo_tvout", .driver_data = (void *) AB8540_LDO_TVOUT, }, { .name = "ab8500_ldo_audio", .driver_data = (void *) AB8540_LDO_AUDIO, }, diff --git a/include/linux/regulator/ab8500.h b/include/linux/regulator/ab8500.h index 44f67e8f1a6d..90b8b5ae9a4e 100644 --- a/include/linux/regulator/ab8500.h +++ b/include/linux/regulator/ab8500.h @@ -74,6 +74,8 @@ enum ab8540_regulator_id { AB8540_LDO_AUX2, AB8540_LDO_AUX3, AB8540_LDO_AUX4, + AB8540_LDO_AUX5, + AB8540_LDO_AUX6, AB8540_LDO_INTCORE, AB8540_LDO_TVOUT, AB8540_LDO_AUDIO, -- cgit From c4e67bbc99ce661808c3ee77b0bb5779d0df11ca Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Tue, 2 Apr 2013 13:24:19 +0100 Subject: ARM: ux500: Pass regulator platform data using the new format Regulator platform data is now passed through a single structure as opposed to the old way where four separate struct elements were required. This patch makes use of the new format.
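[Editor's note: the following sketch is not part of the patch. It shows roughly how a board file might build the single aggregated structure that board-mop500.c points at below; the field names inside struct ab8500_regulator_platform_data are assumptions inferred from the four elements they replace, not copied from the real header.]

/* Hypothetical board-side aggregation under the new single-struct
 * format. Field names (reg_init, num_reg_init, regulator,
 * num_regulator) are assumed for illustration only. */
static struct ab8500_regulator_platform_data ab8500_regulator_plat_data = {
	.reg_init	= ab8500_regulator_reg_init,
	.num_reg_init	= ARRAY_SIZE(ab8500_regulator_reg_init),
	.regulator	= ab8500_regulators,
	.num_regulator	= ARRAY_SIZE(ab8500_regulators),
};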
Signed-off-by: Lee Jones Signed-off-by: Mark Brown --- arch/arm/mach-ux500/board-mop500.c | 5 +---- include/linux/mfd/abx500/ab8500.h | 11 ++--------- 2 files changed, 3 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/arm/mach-ux500/board-mop500.c b/arch/arm/mach-ux500/board-mop500.c index b03457881c4b..b1124bd34a6c 100644 --- a/arch/arm/mach-ux500/board-mop500.c +++ b/arch/arm/mach-ux500/board-mop500.c @@ -198,10 +198,7 @@ static struct platform_device snowball_sbnet_dev = { struct ab8500_platform_data ab8500_platdata = { .irq_base = MOP500_AB8500_IRQ_BASE, - .regulator_reg_init = ab8500_regulator_reg_init, - .num_regulator_reg_init = ARRAY_SIZE(ab8500_regulator_reg_init), - .regulator = ab8500_regulators, - .num_regulator = ARRAY_SIZE(ab8500_regulators), + .regulator = &ab8500_regulator_plat_data, .gpio = &ab8500_gpio_pdata, .codec = &ab8500_codec_pdata, }; diff --git a/include/linux/mfd/abx500/ab8500.h b/include/linux/mfd/abx500/ab8500.h index 9db0bda446a0..84f449475c25 100644 --- a/include/linux/mfd/abx500/ab8500.h +++ b/include/linux/mfd/abx500/ab8500.h @@ -364,8 +364,7 @@ struct ab8500 { const int *irq_reg_offset; }; -struct regulator_reg_init; -struct regulator_init_data; +struct ab8500_regulator_platform_data; struct ab8500_gpio_platform_data; struct ab8500_codec_platform_data; struct ab8500_sysctrl_platform_data; @@ -375,19 +374,13 @@ struct ab8500_sysctrl_platform_data; * @irq_base: start of AB8500 IRQs, AB8500_NR_IRQS will be used * @pm_power_off: Should machine pm power off hook be registered or not * @init: board-specific initialization after detection of ab8500 - * @num_regulator_reg_init: number of regulator init registers - * @regulator_reg_init: regulator init registers - * @num_regulator: number of regulators * @regulator: machine-specific constraints for regulators */ struct ab8500_platform_data { int irq_base; bool pm_power_off; void (*init) (struct ab8500 *); - int num_regulator_reg_init; - struct ab8500_regulator_reg_init *regulator_reg_init; - int num_regulator; - struct regulator_init_data *regulator; + struct ab8500_regulator_platform_data *regulator; struct abx500_gpio_platform_data *gpio; struct ab8500_codec_platform_data *codec; struct ab8500_sysctrl_platform_data *sysctrl; -- cgit From 056b205316cc3dcf8a67cf813a26ff8a72bf3cb9 Mon Sep 17 00:00:00 2001 From: Soren Brinkmann Date: Tue, 2 Apr 2013 15:36:56 -0700 Subject: clk: divider: Introduce CLK_DIVIDER_ALLOW_ZERO flag Dividers which have CLK_DIVIDER_ONE_BASED set have a redundant state, being a divider value of zero. Some hardware implementations allow a zero divider which simply doesn't alter the frequency. I.e., it acts as a divide-by-one, bypassing the divider. This flag is used to handle such HW in the clk-divider model.
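[Editor's note: a usage sketch, not part of the patch. The clock names, MMIO offset, and the shift/width of the divider field are placeholders; only the flag combination is the point.]

/* Registering a one-based divider whose raw value 0 means "bypass"
 * (divide by one), e.g. from a platform's clock-init code. */
static DEFINE_SPINLOCK(periph_div_lock);

clk = clk_register_divider(NULL, "periph_div", "pll_out",
			   0,		/* framework flags */
			   base + 0x10,	/* hypothetical divider register */
			   0, 4,	/* shift, width */
			   CLK_DIVIDER_ONE_BASED | CLK_DIVIDER_ALLOW_ZERO,
			   &periph_div_lock);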
Signed-off-by: Soren Brinkmann Signed-off-by: Mike Turquette --- drivers/clk/clk-divider.c | 5 +++-- include/linux/clk-provider.h | 8 +++++++- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/clk/clk-divider.c b/drivers/clk/clk-divider.c index 68b402101170..6d9674160430 100644 --- a/drivers/clk/clk-divider.c +++ b/drivers/clk/clk-divider.c @@ -109,8 +109,9 @@ static unsigned long clk_divider_recalc_rate(struct clk_hw *hw, div = _get_div(divider, val); if (!div) { - WARN(1, "%s: Invalid divisor for clock %s\n", __func__, - __clk_get_name(hw->clk)); + WARN(!(divider->flags & CLK_DIVIDER_ALLOW_ZERO), + "%s: Zero divisor and CLK_DIVIDER_ALLOW_ZERO not set\n", + __clk_get_name(hw->clk)); return parent_rate; } diff --git a/include/linux/clk-provider.h b/include/linux/clk-provider.h index b1675074fe7c..9fdfae74d669 100644 --- a/include/linux/clk-provider.h +++ b/include/linux/clk-provider.h @@ -249,9 +249,14 @@ struct clk_div_table { * CLK_DIVIDER_ONE_BASED - by default the divisor is the value read from the * register plus one. If CLK_DIVIDER_ONE_BASED is set then the divider is * the raw value read from the register, with the value of zero considered - * invalid + * invalid, unless CLK_DIVIDER_ALLOW_ZERO is set. * CLK_DIVIDER_POWER_OF_TWO - clock divisor is 2 raised to the value read from * the hardware register + * CLK_DIVIDER_ALLOW_ZERO - Allow zero divisors. For dividers which have + * CLK_DIVIDER_ONE_BASED set, it is possible to end up with a zero divisor. + * Some hardware implementations gracefully handle this case and allow a + * zero divisor by not modifying their input clock + * (divide by one / bypass). */ struct clk_divider { struct clk_hw hw; @@ -265,6 +270,7 @@ struct clk_divider { #define CLK_DIVIDER_ONE_BASED BIT(0) #define CLK_DIVIDER_POWER_OF_TWO BIT(1) +#define CLK_DIVIDER_ALLOW_ZERO BIT(2) extern const struct clk_ops clk_divider_ops; struct clk *clk_register_divider(struct device *dev, const char *name, -- cgit From d8668fcb0b257d9fdcfbe5c172a99b8d85e1cd82 Mon Sep 17 00:00:00 2001 From: Shan Hai Date: Mon, 18 Mar 2013 10:30:43 +0800 Subject: libata: Use integer return value for atapi_command_packet_set The function returns the type of ATAPI drives, so it should return an integer value. Commit 4dce8ba94c7 (libata: Use 'bool' return value for ata_id_XXX), in v2.6.39, changed the type of the return value from int to bool; the change caused all of the ATAPI class drives to be treated as TYPE_TAPE and the max_sectors of the drives to be set to 65535 because of commit f8d8e5799b7 (libata: increase 128 KB / cmd limit for ATAPI tape drives), since the function would return true for all ATAPI class drives and TYPE_TAPE is defined as 0x01.
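[Editor's note: an illustration of the truncation being fixed, not part of the patch. The 0x05 command packet set (a CD-ROM class device) is a made-up example value.]

/* Word 0 of the IDENTIFY data carries the command packet set in
 * bits 12:8. With a bool return, every non-zero set collapses to 1,
 * and 1 happens to be TYPE_TAPE, so a CD-ROM was treated as a tape
 * drive and had its max_sectors limit raised to 65535. */
u16 config = 0x05 << 8;			/* hypothetical CD-ROM IDENTIFY word */
int  as_int  = (config >> 8) & 0x1f;	/* 0x05: classified correctly */
bool as_bool = (config >> 8) & 0x1f;	/* true, i.e. 1 == TYPE_TAPE: wrong */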
Cc: stable@vger.kernel.org Signed-off-by: Shan Hai Signed-off-by: Jeff Garzik --- include/linux/ata.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 8f7a3d68371a..ee0bd9524055 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -954,7 +954,7 @@ static inline int atapi_cdb_len(const u16 *dev_id) } } -static inline bool atapi_command_packet_set(const u16 *dev_id) +static inline int atapi_command_packet_set(const u16 *dev_id) { return (dev_id[ATA_ID_CONFIG] >> 8) & 0x1f; } -- cgit From a32450e127fc6e5ca6d958ceb3cfea4d30a00846 Mon Sep 17 00:00:00 2001 From: Shan Hai Date: Mon, 18 Mar 2013 10:30:44 +0800 Subject: libata: Set max sector to 65535 for Slimtype DVD A DS8A8SH drive The Slimtype DVD A DS8A8SH drive locks up when its max sector count is smaller than 65535, and the backtrace below is observed when it locks up: INFO: task flush-8:32:1130 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. flush-8:32 D ffffffff8180cf60 0 1130 2 0x00000000 ffff880273aef618 0000000000000046 0000000000000005 ffff880273aee000 ffff880273aee000 ffff880273aeffd8 ffff880273aee010 ffff880273aee000 ffff880273aeffd8 ffff880273aee000 ffff88026e842ea0 ffff880274a10000 Call Trace: [] schedule+0x5d/0x70 [] io_schedule+0x8c/0xd0 [] get_request+0x731/0x7d0 [] ? cfq_allow_merge+0x50/0x90 [] ? wake_up_bit+0x40/0x40 [] ? bio_attempt_back_merge+0x33/0x110 [] blk_queue_bio+0x23a/0x3f0 [] generic_make_request+0xc6/0x120 [] submit_bio+0x138/0x160 [] ? bio_alloc_bioset+0x96/0x120 [] submit_bh+0x1f1/0x220 [] __block_write_full_page+0x228/0x340 [] ? attach_nobh_buffers+0xc0/0xc0 [] ? I_BDEV+0x10/0x10 [] ? I_BDEV+0x10/0x10 [] block_write_full_page_endio+0xe6/0x100 [] block_write_full_page+0x15/0x20 [] blkdev_writepage+0x18/0x20 [] __writepage+0x17/0x40 [] write_cache_pages+0x34a/0x4a0 [] ? set_page_dirty+0x70/0x70 [] generic_writepages+0x51/0x80 [] do_writepages+0x20/0x50 [] __writeback_single_inode+0xa6/0x2b0 [] writeback_sb_inodes+0x311/0x4d0 [] __writeback_inodes_wb+0x86/0xd0 [] wb_writeback+0x1a3/0x330 [] ? _raw_spin_lock_irqsave+0x3f/0x50 [] ? get_nr_inodes+0x52/0x70 [] wb_do_writeback+0x1dc/0x260 [] ? schedule_timeout+0x204/0x240 [] bdi_writeback_thread+0x102/0x2b0 [] ? wb_do_writeback+0x260/0x260 [] kthread+0xc0/0xd0 [] ? kthread_worker_fn+0x1b0/0x1b0 [] ret_from_fork+0x7c/0xb0 [] ? kthread_worker_fn+0x1b0/0x1b0 The above trace was triggered by "dd if=/dev/zero of=/dev/sr0 bs=2048 count=32768" It was previously working by accident, since another bug introduced by 4dce8ba94c7 (libata: Use 'bool' return value for ata_id_XXX) caused all drives to use maxsect=65535.
Cc: stable@vger.kernel.org Signed-off-by: Shan Hai Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 4 ++++ include/linux/libata.h | 1 + 2 files changed, 5 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 497adea1f0d6..0075944a64dc 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2439,6 +2439,9 @@ int ata_dev_configure(struct ata_device *dev) dev->max_sectors = min_t(unsigned int, ATA_MAX_SECTORS_128, dev->max_sectors); + if (dev->horkage & ATA_HORKAGE_MAX_SEC_LBA48) + dev->max_sectors = ATA_MAX_SECTORS_LBA48; + if (ap->ops->dev_config) ap->ops->dev_config(dev); @@ -4100,6 +4103,7 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* Weird ATAPI devices */ { "TORiSAN DVD-ROM DRD-N216", NULL, ATA_HORKAGE_MAX_SEC_128 }, { "QUANTUM DAT DAT72-000", NULL, ATA_HORKAGE_ATAPI_MOD16_DMA }, + { "Slimtype DVD A DS8A8SH", NULL, ATA_HORKAGE_MAX_SEC_LBA48 }, /* Devices we expect to fail diagnostics */ diff --git a/include/linux/libata.h b/include/linux/libata.h index 91c9d109e5f1..eae7a053dc51 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -398,6 +398,7 @@ enum { ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */ ATA_HORKAGE_BROKEN_FPDMA_AA = (1 << 15), /* skip AA */ ATA_HORKAGE_DUMP_ID = (1 << 16), /* dump IDENTIFY data */ + ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit From d76a3a77113db020d9bb1e894822869410450bd9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Apr 2013 22:02:52 -0400 Subject: ext4/jbd2: don't wait (forever) for stale tid caused by wraparound In the case where an inode has a very stale transaction id (tid) in i_datasync_tid or i_sync_tid, it's possible that after a very large (2**31) number of transactions, the tid number space might wrap, causing tid_geq()'s calculations to fail. Commit deeeaf13 "jbd2: fix fsync() tid wraparound bug", later modified by commit e7b04ac0 "jbd2: don't wake kjournald unnecessarily", attempted to fix this problem, but it only avoided kjournald spinning forever by fixing the logic in jbd2_log_start_commit(). Unfortunately, in the codepaths in fs/ext4/fsync.c and fs/ext4/inode.c that might call jbd2_log_start_commit() with a stale tid, those functions will subsequently call jbd2_log_wait_commit() with the same stale tid, and then wait for a very long time. To fix this, we replace the calls to jbd2_log_start_commit() and jbd2_log_wait_commit() with a call to a new function, jbd2_complete_transaction(), which will correctly handle stale tids. As a bonus, jbd2_complete_transaction() will avoid locking j_state_lock for writing unless a commit needs to be started. This should have a small (but probably not measurable) improvement for ext4's scalability.
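[Editor's note: a sketch of the wraparound arithmetic, not part of the patch. tid_geq() below mirrors the comparison helper in include/linux/jbd2.h; the concrete numbers are invented.]

typedef unsigned int tid_t;

/* Valid only while the two tids are less than 2^31 apart: beyond
 * that, the signed difference flips sign. */
static inline int tid_geq(tid_t x, tid_t y)
{
	int difference = (x - y);
	return (difference >= 0);
}

/* After ~2^31 commits, an ancient tid of 10 no longer compares as
 * completed: tid_geq(0x8000000a, 10) returns 0, so code that waits
 * until the commit sequence reaches that stale tid sleeps forever.
 * jbd2_complete_transaction() sidesteps this by checking the
 * running/committing transactions instead of comparing raw tids. */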
Signed-off-by: "Theodore Ts'o" Reported-by: Ben Hutchings Reported-by: George Barnett Cc: stable@vger.kernel.org --- fs/ext4/fsync.c | 3 +-- fs/ext4/inode.c | 3 +-- fs/jbd2/journal.c | 31 +++++++++++++++++++++++++++++++ include/linux/jbd2.h | 1 + 4 files changed, 34 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 3278e64e57b6..e0ba8a408def 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -166,8 +166,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (journal->j_flags & JBD2_BARRIER && !jbd2_trans_will_send_data_barrier(journal, commit_tid)) needs_barrier = true; - jbd2_log_start_commit(journal, commit_tid); - ret = jbd2_log_wait_commit(journal, commit_tid); + ret = jbd2_complete_transaction(journal, commit_tid); if (needs_barrier) { err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); if (!ret) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 56ebd662033b..addba9e0a1a4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -210,8 +210,7 @@ void ext4_evict_inode(struct inode *inode) journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; tid_t commit_tid = EXT4_I(inode)->i_datasync_tid; - jbd2_log_start_commit(journal, commit_tid); - jbd2_log_wait_commit(journal, commit_tid); + jbd2_complete_transaction(journal, commit_tid); filemap_write_and_wait(&inode->i_data); } truncate_inode_pages(&inode->i_data, 0); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index ed10991ab006..886ec2faa9b4 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -709,6 +709,37 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) return err; } +/* + * When this function returns, the transaction corresponding to tid + * will have completed. If the transaction is currently running, start + * committing that transaction before waiting for it to complete. If + * the transaction id is stale, it is by definition already completed, + * so just return SUCCESS.
+ */ +int jbd2_complete_transaction(journal_t *journal, tid_t tid) +{ + int need_to_wait = 1; + + read_lock(&journal->j_state_lock); + if (journal->j_running_transaction && + journal->j_running_transaction->t_tid == tid) { + if (journal->j_commit_request != tid) { + /* transaction not yet started, so request it */ + read_unlock(&journal->j_state_lock); + jbd2_log_start_commit(journal, tid); + goto wait_commit; + } + } else if (!(journal->j_committing_transaction && + journal->j_committing_transaction->t_tid == tid)) + need_to_wait = 0; + read_unlock(&journal->j_state_lock); + if (!need_to_wait) + return 0; +wait_commit: + return jbd2_log_wait_commit(journal, tid); +} +EXPORT_SYMBOL(jbd2_complete_transaction); + /* * Log buffer allocation routines: */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 50e5a5e6a712..f0289754b464 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1200,6 +1200,7 @@ int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_force_commit_nested(journal_t *journal); int jbd2_log_wait_commit(journal_t *journal, tid_t tid); +int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); -- cgit From 794446c6946513c684d448205fbd76fa35f38b72 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Apr 2013 22:06:52 -0400 Subject: jbd2: fix race between jbd2_journal_remove_checkpoint and ->j_commit_callback The following race is possible: [kjournald2] other_task jbd2_journal_commit_transaction() j_state = T_FINISHED; spin_unlock(&journal->j_list_lock); ->jbd2_journal_remove_checkpoint() ->jbd2_journal_free_transaction(); ->kmem_cache_free(transaction) ->j_commit_callback(journal, transaction); -> USE_AFTER_FREE WARNING: at lib/list_debug.c:62 __list_del_entry+0x1c0/0x250() Hardware name: list_del corruption. prev->next should be ffff88019a4ec198, but was 6b6b6b6b6b6b6b6b Modules linked in: cpufreq_ondemand acpi_cpufreq freq_table mperf coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode sg xhci_hcd button sd_mod crc_t10dif aesni_intel ablk_helper cryptd lrw aes_x86_64 xts gf128mul ahci libahci pata_acpi ata_generic dm_mirror dm_region_hash dm_log dm_mod Pid: 16400, comm: jbd2/dm-1-8 Tainted: G W 3.8.0-rc3+ #107 Call Trace: [] warn_slowpath_common+0xad/0xf0 [] warn_slowpath_fmt+0x46/0x50 [] ? ext4_journal_commit_callback+0x99/0xc0 [] __list_del_entry+0x1c0/0x250 [] ext4_journal_commit_callback+0x6f/0xc0 [] jbd2_journal_commit_transaction+0x23a6/0x2570 [] ? try_to_del_timer_sync+0x82/0xa0 [] ? del_timer_sync+0x91/0x1e0 [] kjournald2+0x19f/0x6a0 [] ? wake_up_bit+0x40/0x40 [] ? bit_spin_lock+0x80/0x80 [] kthread+0x10e/0x120 [] ? __init_kthread_worker+0x70/0x70 [] ret_from_fork+0x7c/0xb0 [] ? __init_kthread_worker+0x70/0x70 In order to demonstrate this issue, one should mount ext4 with the -o discard mount option on an SSD disk. This makes the callback take longer, and the race window becomes wider.
In order to fix this, we should mark the transaction as finished only after the callbacks have completed. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/jbd2/commit.c | 50 ++++++++++++++++++++++++++++---------------------- include/linux/jbd2.h | 1 + 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'include/linux') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 750c70148eff..0f53946f13c1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -382,7 +382,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) int space_left = 0; int first_tag = 0; int tag_flag; - int i, to_free = 0; + int i; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -1134,7 +1134,7 @@ restart_loop: journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged; spin_unlock(&journal->j_history_lock); - commit_transaction->t_state = T_FINISHED; + commit_transaction->t_state = T_COMMIT_CALLBACK; J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; @@ -1149,38 +1149,44 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; + write_unlock(&journal->j_state_lock); - if (commit_transaction->t_checkpoint_list == NULL && - commit_transaction->t_checkpoint_io_list == NULL) { - __jbd2_journal_drop_transaction(journal, commit_transaction); - to_free = 1; + if (journal->j_checkpoint_transactions == NULL) { + journal->j_checkpoint_transactions = commit_transaction; + commit_transaction->t_cpnext = commit_transaction; + commit_transaction->t_cpprev = commit_transaction; } else { - if (journal->j_checkpoint_transactions == NULL) { - journal->j_checkpoint_transactions = commit_transaction; - commit_transaction->t_cpnext = commit_transaction; - commit_transaction->t_cpprev = commit_transaction; - } else { - commit_transaction->t_cpnext = - journal->j_checkpoint_transactions; - commit_transaction->t_cpprev = - commit_transaction->t_cpnext->t_cpprev; - commit_transaction->t_cpnext->t_cpprev = - commit_transaction; - commit_transaction->t_cpprev->t_cpnext = + commit_transaction->t_cpnext = + journal->j_checkpoint_transactions; + commit_transaction->t_cpprev = + commit_transaction->t_cpnext->t_cpprev; + commit_transaction->t_cpnext->t_cpprev = + commit_transaction; + commit_transaction->t_cpprev->t_cpnext = commit_transaction; - } } spin_unlock(&journal->j_list_lock); - + /* Drop all spin_locks because commit_callback may block.
+ * __journal_remove_checkpoint() can not destroy transaction + * under us because it is not marked as T_FINISHED yet */ if (journal->j_commit_callback) journal->j_commit_callback(journal, commit_transaction); trace_jbd2_end_commit(journal, commit_transaction); jbd_debug(1, "JBD2: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); - if (to_free) - jbd2_journal_free_transaction(commit_transaction); + write_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); + commit_transaction->t_state = T_FINISHED; + /* Recheck checkpoint lists after j_list_lock was dropped */ + if (commit_transaction->t_checkpoint_list == NULL && + commit_transaction->t_checkpoint_io_list == NULL) { + __jbd2_journal_drop_transaction(journal, commit_transaction); + jbd2_journal_free_transaction(commit_transaction); + } + spin_unlock(&journal->j_list_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_done_commit); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f0289754b464..f9fe88957b7a 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -480,6 +480,7 @@ struct transaction_s T_COMMIT, T_COMMIT_DFLUSH, T_COMMIT_JFLUSH, + T_COMMIT_CALLBACK, T_FINISHED } t_state; -- cgit From c31ad081e8734aab3fb45d2f32e9969994dd076e Mon Sep 17 00:00:00 2001 From: Arve Hjønnevåg Date: Tue, 22 May 2012 16:33:23 -0700 Subject: pstore/ram: Allow specifying ecc parameters in platform data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow specifying ecc parameters in platform data Signed-off-by: Arve Hjønnevåg [jstultz: Tweaked commit subject & add commit message] Signed-off-by: John Stultz Acked-by: Kees Cook Signed-off-by: Anton Vorontsov --- fs/pstore/ram.c | 15 ++++++----- fs/pstore/ram_core.c | 64 ++++++++++++++++++++++++---------------------- include/linux/pstore_ram.h | 14 +++++++--- 3 files changed, 52 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 38babb3a9384..a5ee252c264e 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -83,7 +83,7 @@ struct ramoops_context { size_t console_size; size_t ftrace_size; int dump_oops; - int ecc_size; + struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; unsigned int dump_read_cnt; @@ -322,7 +322,8 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, for (i = 0; i < cxt->max_dump_cnt; i++) { size_t sz = cxt->record_size; - cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, cxt->ecc_size); + cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + &cxt->ecc_info); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", @@ -352,7 +353,7 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, cxt->ecc_size); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); @@ -406,7 +407,7 @@ static int ramoops_probe(struct platform_device *pdev) cxt->console_size = pdata->console_size; cxt->ftrace_size = pdata->ftrace_size; cxt->dump_oops = pdata->dump_oops; - cxt->ecc_size = pdata->ecc_size; + cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; @@ -464,9 +465,9 @@ static int ramoops_probe(struct platform_device *pdev) record_size = pdata->record_size; dump_oops = pdata->dump_oops; - pr_info("attached 0x%lx@0x%llx, ecc: %d\n", + 
pr_info("attached 0x%lx@0x%llx, ecc: %d/%d\n", cxt->size, (unsigned long long)cxt->phys_addr, - cxt->ecc_size); + cxt->ecc_info.ecc_size, cxt->ecc_info.block_size); return 0; @@ -538,7 +539,7 @@ static void ramoops_register_dummy(void) * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). */ - dummy_data->ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; + dummy_data->ecc_info.ecc_size = ramoops_ecc == 1 ? 16 : ramoops_ecc; dummy = platform_device_register_data(NULL, "ramoops", -1, dummy_data, sizeof(struct ramoops_platform_data)); diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index e5afa222c213..c6f641c10179 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -82,12 +82,12 @@ static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, uint8_t *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; /* Initialize the parity buffer */ memset(par, 0, sizeof(par)); encode_rs8(prz->rs_decoder, data, len, par, 0); - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) ecc[i] = par[i]; } @@ -95,9 +95,9 @@ static int persistent_ram_decode_rs8(struct persistent_ram_zone *prz, void *data, size_t len, uint8_t *ecc) { int i; - uint16_t par[prz->ecc_size]; + uint16_t par[prz->ecc_info.ecc_size]; - for (i = 0; i < prz->ecc_size; i++) + for (i = 0; i < prz->ecc_info.ecc_size; i++) par[i] = ecc[i]; return decode_rs8(prz->rs_decoder, data, par, len, NULL, 0, NULL, 0, NULL); @@ -110,15 +110,15 @@ static void notrace persistent_ram_update_ecc(struct persistent_ram_zone *prz, uint8_t *buffer_end = buffer->data + prz->buffer_size; uint8_t *block; uint8_t *par; - int ecc_block_size = prz->ecc_block_size; - int ecc_size = prz->ecc_size; - int size = prz->ecc_block_size; + int ecc_block_size = prz->ecc_info.block_size; + int ecc_size = prz->ecc_info.ecc_size; + int size = ecc_block_size; - if (!prz->ecc_size) + if (!ecc_size) return; block = buffer->data + (start & ~(ecc_block_size - 1)); - par = prz->par_buffer + (start / ecc_block_size) * prz->ecc_size; + par = prz->par_buffer + (start / ecc_block_size) * ecc_size; do { if (block + ecc_block_size > buffer_end) @@ -133,7 +133,7 @@ static void persistent_ram_update_header_ecc(struct persistent_ram_zone *prz) { struct persistent_ram_buffer *buffer = prz->buffer; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; persistent_ram_encode_rs8(prz, (uint8_t *)buffer, sizeof(*buffer), @@ -146,14 +146,14 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) uint8_t *block; uint8_t *par; - if (!prz->ecc_size) + if (!prz->ecc_info.ecc_size) return; block = buffer->data; par = prz->par_buffer; while (block < buffer->data + buffer_size(prz)) { int numerr; - int size = prz->ecc_block_size; + int size = prz->ecc_info.block_size; if (block + size > buffer->data + prz->buffer_size) size = buffer->data + prz->buffer_size - block; numerr = persistent_ram_decode_rs8(prz, block, size, par); @@ -166,45 +166,49 @@ static void persistent_ram_ecc_old(struct persistent_ram_zone *prz) block); prz->bad_blocks++; } - block += prz->ecc_block_size; - par += prz->ecc_size; + block += prz->ecc_info.block_size; + par += prz->ecc_info.ecc_size; } } static int persistent_ram_init_ecc(struct persistent_ram_zone *prz, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int numerr; struct persistent_ram_buffer *buffer = prz->buffer; int ecc_blocks; size_t ecc_total; - int 
ecc_symsize = 8; - int ecc_poly = 0x11d; - if (!ecc_size) + if (!ecc_info || !ecc_info->ecc_size) return 0; - prz->ecc_block_size = 128; - prz->ecc_size = ecc_size; + prz->ecc_info.block_size = ecc_info->block_size ?: 128; + prz->ecc_info.ecc_size = ecc_info->ecc_size ?: 16; + prz->ecc_info.symsize = ecc_info->symsize ?: 8; + prz->ecc_info.poly = ecc_info->poly ?: 0x11d; - ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_size, - prz->ecc_block_size + prz->ecc_size); - ecc_total = (ecc_blocks + 1) * prz->ecc_size; + ecc_blocks = DIV_ROUND_UP(prz->buffer_size - prz->ecc_info.ecc_size, + prz->ecc_info.block_size + + prz->ecc_info.ecc_size); + ecc_total = (ecc_blocks + 1) * prz->ecc_info.ecc_size; if (ecc_total >= prz->buffer_size) { pr_err("%s: invalid ecc_size %u (total %zu, buffer size %zu)\n", - __func__, prz->ecc_size, ecc_total, prz->buffer_size); + __func__, prz->ecc_info.ecc_size, + ecc_total, prz->buffer_size); return -EINVAL; } prz->buffer_size -= ecc_total; prz->par_buffer = buffer->data + prz->buffer_size; - prz->par_header = prz->par_buffer + ecc_blocks * prz->ecc_size; + prz->par_header = prz->par_buffer + + ecc_blocks * prz->ecc_info.ecc_size; /* * first consecutive root is 0 * primitive element to generate roots = 1 */ - prz->rs_decoder = init_rs(ecc_symsize, ecc_poly, 0, 1, prz->ecc_size); + prz->rs_decoder = init_rs(prz->ecc_info.symsize, prz->ecc_info.poly, + 0, 1, prz->ecc_info.ecc_size); if (prz->rs_decoder == NULL) { pr_info("persistent_ram: init_rs failed\n"); return -EINVAL; @@ -392,11 +396,11 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - int ecc_size) + struct persistent_ram_ecc_info *ecc_info) { int ret; - ret = persistent_ram_init_ecc(prz, ecc_size); + ret = persistent_ram_init_ecc(prz, ecc_info); if (ret) return ret; @@ -445,7 +449,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) } struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, int ecc_size) + u32 sig, struct persistent_ram_ecc_info *ecc_info) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -460,7 +464,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_size); + ret = persistent_ram_post_init(prz, sig, ecc_info); if (ret) goto err; diff --git a/include/linux/pstore_ram.h b/include/linux/pstore_ram.h index cb6ab5feab67..9974975d40db 100644 --- a/include/linux/pstore_ram.h +++ b/include/linux/pstore_ram.h @@ -26,6 +26,13 @@ struct persistent_ram_buffer; struct rs_control; +struct persistent_ram_ecc_info { + int block_size; + int ecc_size; + int symsize; + int poly; +}; + struct persistent_ram_zone { phys_addr_t paddr; size_t size; @@ -39,15 +46,14 @@ struct persistent_ram_zone { struct rs_control *rs_decoder; int corrected_bytes; int bad_blocks; - int ecc_block_size; - int ecc_size; + struct persistent_ram_ecc_info ecc_info; char *old_log; size_t old_log_size; }; struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, - u32 sig, int ecc_size); + u32 sig, struct persistent_ram_ecc_info *ecc_info); void persistent_ram_free(struct persistent_ram_zone *prz); void persistent_ram_zap(struct persistent_ram_zone *prz); @@ -74,7 +80,7 @@ struct ramoops_platform_data { unsigned long console_size; unsigned long ftrace_size; int dump_oops; - int ecc_size; + struct persistent_ram_ecc_info ecc_info; }; #endif -- cgit From 
3daf37260e965aa4bb060db99c2ed10b28109e04 Mon Sep 17 00:00:00 2001 From: Tony Prisk Date: Sat, 23 Mar 2013 17:02:15 +1300 Subject: of: Add support for reading a u32 from a multi-value property. This patch adds an of_property_read_u32_index() function to allow reading a single indexed u32 value from a property containing multiple u32 values. Signed-off-by: Tony Prisk Reviewed-by: Stephen Warren Acked-by: Linus Walleij Acked-by: Rob Herring --- drivers/of/base.c | 33 +++++++++++++++++++++++++++++++++ include/linux/of.h | 9 +++++++++ 2 files changed, 42 insertions(+) (limited to 'include/linux') diff --git a/drivers/of/base.c b/drivers/of/base.c index 321d3ef05006..f6c89ed38db9 100644 --- a/drivers/of/base.c +++ b/drivers/of/base.c @@ -745,6 +745,39 @@ struct device_node *of_find_node_by_phandle(phandle handle) } EXPORT_SYMBOL(of_find_node_by_phandle); +/** + * of_property_read_u32_index - Find and read a u32 from a multi-value property. + * + * @np: device node from which the property value is to be read. + * @propname: name of the property to be searched. + * @index: index of the u32 in the list of values + * @out_value: pointer to return value, modified only if no error. + * + * Search for a property in a device node and read nth 32-bit value from + * it. Returns 0 on success, -EINVAL if the property does not exist, + * -ENODATA if property does not have a value, and -EOVERFLOW if the + * property data isn't large enough. + * + * The out_value is modified only if a valid u32 value can be decoded. + */ +int of_property_read_u32_index(const struct device_node *np, + const char *propname, + u32 index, u32 *out_value) +{ + struct property *prop = of_find_property(np, propname, NULL); + + if (!prop) + return -EINVAL; + if (!prop->value) + return -ENODATA; + if (((index + 1) * sizeof(*out_value)) > prop->length) + return -EOVERFLOW; + + *out_value = be32_to_cpup(((__be32 *)prop->value) + index); + return 0; +} +EXPORT_SYMBOL_GPL(of_property_read_u32_index); + /** * of_property_read_u8_array - Find and read an array of u8 from a property. * diff --git a/include/linux/of.h b/include/linux/of.h index a0f129284948..c0747a44eaff 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -235,6 +235,9 @@ extern struct device_node *of_find_node_with_property( extern struct property *of_find_property(const struct device_node *np, const char *name, int *lenp); +extern int of_property_read_u32_index(const struct device_node *np, + const char *propname, + u32 index, u32 *out_value); extern int of_property_read_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz); extern int of_property_read_u16_array(const struct device_node *np, @@ -394,6 +397,12 @@ static inline struct device_node *of_find_compatible_node( return NULL; } +static inline int of_property_read_u32_index(const struct device_node *np, + const char *propname, u32 index, u32 *out_value) +{ + return -ENOSYS; +} + static inline int of_property_read_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz) { -- cgit From 3af4ae1e4b57a663fff9cfe711c84fefb6ec966f Mon Sep 17 00:00:00 2001 From: Tony Prisk Date: Wed, 3 Apr 2013 07:20:34 +1300 Subject: video: vt8500: Remove unused platform_data/video-vt8500lcdfb.h With the conversion to devicetree only for arch-vt8500, this header is no longer required. This patch removes the #include from the two framebuffer drivers that used it, and the header file. 
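As a usage sketch only (not code from this patch): with the platform data gone, a devicetree-only probe would pull equivalent values from DT properties, for example via the of_property_read_u32_index() helper introduced in the previous patch. The property name and function below are invented for illustration:

#include <linux/of.h>

/*
 * Hypothetical example; "display-timings-raw" and this function are
 * not from the vt8500 driver. Reads the third u32 cell of a
 * multi-value property such as:
 *     display-timings-raw = <800 480 36000000>;
 * Returns 0, or -EINVAL/-ENODATA/-EOVERFLOW as documented above.
 */
static int example_read_pixclock(struct device_node *np, u32 *pixclk)
{
	return of_property_read_u32_index(np, "display-timings-raw", 2, pixclk);
}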
Signed-off-by: Tony Prisk Reviewed-by: Jean-Christophe Plagniol-Villard Signed-off-by: Tomi Valkeinen --- drivers/video/vt8500lcdfb.c | 2 -- drivers/video/wm8505fb.c | 2 -- include/linux/platform_data/video-vt8500lcdfb.h | 31 ------------------------- 3 files changed, 35 deletions(-) delete mode 100644 include/linux/platform_data/video-vt8500lcdfb.h (limited to 'include/linux') diff --git a/drivers/video/vt8500lcdfb.c b/drivers/video/vt8500lcdfb.c index aa2579c2364a..2ff2312a16ac 100644 --- a/drivers/video/vt8500lcdfb.c +++ b/drivers/video/vt8500lcdfb.c @@ -30,8 +30,6 @@ #include #include -#include <linux/platform_data/video-vt8500lcdfb.h> - #include "vt8500lcdfb.h" #include "wmt_ge_rops.h" diff --git a/drivers/video/wm8505fb.c b/drivers/video/wm8505fb.c index 4dd0580f96fd..fe9afd60a018 100644 --- a/drivers/video/wm8505fb.c +++ b/drivers/video/wm8505fb.c @@ -32,8 +32,6 @@ #include #include -#include <linux/platform_data/video-vt8500lcdfb.h> - #include "wm8505fb_regs.h" #include "wmt_ge_rops.h" diff --git a/include/linux/platform_data/video-vt8500lcdfb.h b/include/linux/platform_data/video-vt8500lcdfb.h deleted file mode 100644 index 7f399c370fe0..000000000000 --- a/include/linux/platform_data/video-vt8500lcdfb.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * VT8500/WM8505 Frame Buffer platform data definitions - * - * Copyright (C) 2010 Ed Spiridonov - * - * This software is licensed under the terms of the GNU General Public - * License version 2, as published by the Free Software Foundation, and - * may be copied, distributed, and modified under those terms. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef _VT8500FB_H -#define _VT8500FB_H - -#include <linux/fb.h> - -struct vt8500fb_platform_data { - struct fb_videomode mode; - u32 xres_virtual; - u32 yres_virtual; - u32 bpp; - unsigned long video_mem_phys; - void *video_mem_virt; - unsigned long video_mem_len; -}; - -#endif /* _VT8500FB_H */ -- cgit From 0908ad6e56b5a6e86745680bc324bdbfac64d0b6 Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Fri, 22 Mar 2013 20:46:27 +0530 Subject: uprobes: Add trap variant helper Some architectures like powerpc have multiple variants of the trap instruction. Introduce an additional helper is_trap_insn() for run-time handling of non-uprobe traps on such architectures. While there, change is_swbp_at_addr() to is_trap_at_addr() for reading clarity. With this change, the uprobe registration path will supersede any trap instruction inserted at the requested location, while taking care of delivering the SIGTRAP for cases where the trap notification came in for an address without a uprobe. See [1] for a more detailed explanation. [1] https://lists.ozlabs.org/pipermail/linuxppc-dev/2013-March/104771.html This change was suggested by Oleg Nesterov.
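To illustrate the intended use (a sketch under stated assumptions, not code from this patch), an architecture with several trap encodings could override the weak helper along these lines:

#include <linux/uprobes.h>

/*
 * Hypothetical arch override (sketch only): the generic is_trap_insn()
 * below is declared __weak, so an architecture's own definition takes
 * precedence. ARCH_INSN_IS_TRAP() is a made-up stand-in for the
 * architecture's real trap-opcode tests, not an existing macro.
 */
bool is_trap_insn(uprobe_opcode_t *insn)
{
	/* the uprobes breakpoint instruction is itself a trap variant */
	if (is_swbp_insn(insn))
		return true;

	return ARCH_INSN_IS_TRAP(*insn);
}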
Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Srikar Dronamraju Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 1 + kernel/events/uprobes.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index 02b83db8e2c5..19612881399a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -100,6 +100,7 @@ struct uprobes_state { extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr); extern bool __weak is_swbp_insn(uprobe_opcode_t *insn); +extern bool __weak is_trap_insn(uprobe_opcode_t *insn); extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool); extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 26bc2e24e9e3..ca9012930ce7 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -173,6 +173,20 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn) return *insn == UPROBE_SWBP_INSN; } +/** + * is_trap_insn - check if instruction is breakpoint instruction. + * @insn: instruction to be checked. + * Default implementation of is_trap_insn + * Returns true if @insn is a breakpoint instruction. + * + * This function is needed for the case where an architecture has multiple + * trap instructions (like powerpc). + */ +bool __weak is_trap_insn(uprobe_opcode_t *insn) +{ + return is_swbp_insn(insn); +} + static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode) { void *kaddr = kmap_atomic(page); @@ -185,6 +199,15 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t uprobe_opcode_t old_opcode; bool is_swbp; + /* + * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here. + * We do not check if it is any other 'trap variant' which could + * be conditional trap instruction such as the one powerpc supports. + * + * The logic is that we do not care if the underlying instruction + * is a trap variant; uprobes always wins over any other (gdb) + * breakpoint. + */ copy_opcode(page, vaddr, &old_opcode); is_swbp = is_swbp_insn(&old_opcode); @@ -204,7 +227,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t * Expect the breakpoint instruction to be the smallest size instruction for * the architecture. If an arch has variable length instruction and the * breakpoint instruction is not of the smallest length instruction - * supported by that architecture then we need to modify is_swbp_at_addr and + * supported by that architecture then we need to modify is_trap_at_addr and * write_opcode accordingly. This would never be a problem for archs that * have fixed length instructions. 
*/ @@ -550,7 +573,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file, goto out; ret = -ENOTSUPP; - if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn)) + if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn)) goto out; ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr); @@ -1431,7 +1454,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm) clear_bit(MMF_HAS_UPROBES, &mm->flags); } -static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) +static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) { struct page *page; uprobe_opcode_t opcode; @@ -1452,7 +1475,8 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr) copy_opcode(page, vaddr, &opcode); put_page(page); out: - return is_swbp_insn(&opcode); + /* This needs to return true for any variant of the trap insn */ + return is_trap_insn(&opcode); } static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) @@ -1472,7 +1496,7 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp) } if (!uprobe) - *is_swbp = is_swbp_at_addr(mm, bp_vaddr); + *is_swbp = is_trap_at_addr(mm, bp_vaddr); } else { *is_swbp = -EFAULT; } -- cgit From 4aa02c7cbb6816913554dc18ff750a70a4ace796 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Wed, 13 Mar 2013 14:03:12 +0800 Subject: video: mxsfb: remove fb_phys/fb_size from platform_data There are no in-tree users of mxsfb_platform_data fb_phys/fb_size. With CMA support in the kernel, there is no real need for a platform to reserve memory and pass an address and size into the driver via platform_data. So let's remove fb_phys/fb_size from mxsfb_platform_data to ease full device tree adoption. Signed-off-by: Shawn Guo --- drivers/video/mxsfb.c | 39 +++++++-------------------------------- include/linux/mxsfb.h | 9 --------- 2 files changed, 7 insertions(+), 41 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/mxsfb.c b/drivers/video/mxsfb.c index 69fb3f1d1e12..9e8740bade32 100644 --- a/drivers/video/mxsfb.c +++ b/drivers/video/mxsfb.c @@ -168,7 +168,6 @@ struct mxsfb_info { unsigned ld_intf_width; unsigned dotclk_delay; const struct mxsfb_devdata *devdata; - int mapped; u32 sync; }; @@ -686,7 +685,7 @@ static int mxsfb_init_fbinfo(struct mxsfb_info *host) struct mxsfb_platform_data *pdata = host->pdev->dev.platform_data; dma_addr_t fb_phys; void *fb_virt; - unsigned fb_size = pdata->fb_size; + unsigned fb_size; fb_info->fbops = &mxsfb_ops; fb_info->flags = FBINFO_FLAG_DEFAULT | FBINFO_READS_FAST; @@ -706,30 +705,12 @@ static int mxsfb_init_fbinfo(struct mxsfb_info *host) host->ld_intf_width = pdata->ld_intf_width; /* Memory allocation for framebuffer */ - if (pdata->fb_phys) { - if (!fb_size) - return -EINVAL; - - fb_phys = pdata->fb_phys; + fb_size = SZ_2M; + fb_virt = alloc_pages_exact(fb_size, GFP_DMA); + if (!fb_virt) + return -ENOMEM; - if (!request_mem_region(fb_phys, fb_size, host->pdev->name)) - return -ENOMEM; - - fb_virt = ioremap(fb_phys, fb_size); - if (!fb_virt) { - release_mem_region(fb_phys, fb_size); - return -ENOMEM; - } - host->mapped = 1; - } else { - if (!fb_size) - fb_size = SZ_2M; /* default */ - fb_virt = alloc_pages_exact(fb_size, GFP_DMA); - if (!fb_virt) - return -ENOMEM; - - fb_phys = virt_to_phys(fb_virt); - } + fb_phys = virt_to_phys(fb_virt); fb_info->fix.smem_start = fb_phys; fb_info->screen_base = fb_virt; @@ -745,13 +726,7 @@ static void mxsfb_free_videomem(struct mxsfb_info *host) { struct fb_info *fb_info = &host->fb_info; -
iounmap(fb_info->screen_base); - release_mem_region(fb_info->fix.smem_start, - fb_info->screen_size); - } else { - free_pages_exact(fb_info->screen_base, fb_info->fix.smem_len); - } + free_pages_exact(fb_info->screen_base, fb_info->fix.smem_len); } static struct platform_device_id mxsfb_devtype[] = { diff --git a/include/linux/mxsfb.h b/include/linux/mxsfb.h index f80af8674342..93696404ee55 100644 --- a/include/linux/mxsfb.h +++ b/include/linux/mxsfb.h @@ -35,15 +35,6 @@ struct mxsfb_platform_data { unsigned dotclk_delay; /* refer manual HW_LCDIF_VDCTRL4 register */ unsigned ld_intf_width; /* refer STMLCDIF_* macros */ - - unsigned fb_size; /* Size of the video memory. If zero a - * default will be used - */ - unsigned long fb_phys; /* physical address for the video memory. If - * zero the framebuffer memory will be dynamically - * allocated. If specified,fb_size must also be specified. - * fb_phys must be unused by Linux. - */ u32 sync; /* sync mask, contains MXSFB specifics not * carried in fb_info->var.sync */ -- cgit From 36f3e99649baa77b2d22e385b2ea09e8f308c905 Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Wed, 13 Mar 2013 14:28:19 +0800 Subject: video: mxsfb: remove dotclk_delay from platform_data There are no in-tree mxsfb users using mxsfb_platform_data dotclk_delay. Let's remove it from mxsfb_platform_data to ease full device tree adoption of the mxsfb driver. If a platform/board later needs to configure this parameter, we can add it to the device tree bindings. Signed-off-by: Shawn Guo --- drivers/video/mxsfb.c | 1 - include/linux/mxsfb.h | 1 - 2 files changed, 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/video/mxsfb.c b/drivers/video/mxsfb.c index 9e8740bade32..a89901c7f5e9 100644 --- a/drivers/video/mxsfb.c +++ b/drivers/video/mxsfb.c @@ -701,7 +701,6 @@ static int mxsfb_init_fbinfo(struct mxsfb_info *host) var->accel_flags = 0; var->vmode = FB_VMODE_NONINTERLACED; - host->dotclk_delay = pdata->dotclk_delay; host->ld_intf_width = pdata->ld_intf_width; /* Memory allocation for framebuffer */ diff --git a/include/linux/mxsfb.h b/include/linux/mxsfb.h index 93696404ee55..b78465cdb26e 100644 --- a/include/linux/mxsfb.h +++ b/include/linux/mxsfb.h @@ -33,7 +33,6 @@ struct mxsfb_platform_data { unsigned default_bpp; - unsigned dotclk_delay; /* refer manual HW_LCDIF_VDCTRL4 register */ unsigned ld_intf_width; /* refer STMLCDIF_* macros */ u32 sync; /* sync mask, contains MXSFB specifics not * carried in fb_info->var.sync -- cgit From c8b5cfc8797203d5e952592be00a5acacf5cef6a Mon Sep 17 00:00:00 2001 From: Shawn Guo Date: Thu, 14 Mar 2013 13:21:56 +0800 Subject: video: mxsfb: remove mxsfb_platform_data None of the mxsfb users uses mxsfb_platform_data now. Let's remove it from the mxsfb driver. As a result, include/linux/mxsfb.h gets deleted, with a few macros moved into mxsfb.c. Along with the change, the typo "FAILING" in a macro name is fixed to "FALLING". Signed-off-by: Shawn Guo --- drivers/video/mxsfb.c | 41 +++++++++++++++++------------------------ include/linux/mxsfb.h | 42 ------------------------------------------ 2 files changed, 17 insertions(+), 66 deletions(-) delete mode 100644 include/linux/mxsfb.h (limited to 'include/linux') diff --git a/drivers/video/mxsfb.c b/drivers/video/mxsfb.c index e5ceba54d22f..eac7c1ace7a5 100644 --- a/drivers/video/mxsfb.c +++ b/drivers/video/mxsfb.c @@ -49,7 +49,7 @@ #include #include #include -#include <linux/mxsfb.h> +#include #include