From e00320875d0cc5f8099a7227b2f25fbb3231268d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 08:48:23 +0100
Subject: x86: fix stackprotector canary updates during context switches

fix a bug noticed and fixed by pageexec@freemail.hu.

if built with -fstack-protector-all then we'll have canary checks built
into the __switch_to() function. That does not work well with the
canary-switching code there: while we already use the %rsp of the
new task, we still call __switch_to() with the previous task's canary
value in the PDA, hence the __switch_to() ssp prologue instructions
will store the previous canary. Then we update the PDA and upon return
from __switch_to() the canary check triggers and we panic.

so update the canary after we have called __switch_to(), where we are
at the same stackframe level as the last stackframe of the next
(and now freshly current) task.

Note: this means that we call __switch_to() [and its sub-functions]
still with the old canary, but that is not a problem, both the previous
and the next tasks have a high-quality canary. The only (mostly academic)
disadvantage is that the canary of one task may leak onto the stack of
another task, increasing the risk of information leaks, were an attacker
able to read the stack of specific tasks (but not that of others).

To solve this we'll have to reorganize the way we switch tasks, and move
the PDA setting into the switch_to() assembly code. That will happen in
another patch.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/asm-x86/pda.h    | 2 --
 include/asm-x86/system.h | 6 +++++-
 include/linux/sched.h    | 3 +--
 3 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 101fb9e11954..62b734986a44 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -16,11 +16,9 @@ struct x8664_pda {
 	unsigned long oldrsp;		/* 24 user rsp for system call */
 	int irqcount;			/* 32 Irq nesting counter. Starts -1 */
 	unsigned int cpunumber;		/* 36 Logical CPU number */
-#ifdef CONFIG_CC_STACKPROTECTOR
 	unsigned long stack_canary;	/* 40 stack canary value */
 					/* gcc-ABI: this canary MUST be at
 					   offset 40!!! */
-#endif
 	char *irqstackptr;
 	unsigned int __softirq_pending;
 	unsigned int __nmi_count;	/* number of NMI on this CPUs */
diff --git a/include/asm-x86/system.h b/include/asm-x86/system.h
index a2f04cd79b29..172f54185093 100644
--- a/include/asm-x86/system.h
+++ b/include/asm-x86/system.h
@@ -92,6 +92,8 @@ do {									\
 	     ".globl thread_return\n"					  \
 	     "thread_return:\n\t"					  \
 	     "movq %%gs:%P[pda_pcurrent],%%rsi\n\t"			  \
+	     "movq %P[task_canary](%%rsi),%%r8\n\t"			  \
+	     "movq %%r8,%%gs:%P[pda_canary]\n\t"			  \
 	     "movq %P[thread_info](%%rsi),%%r8\n\t"			  \
 	     LOCK_PREFIX "btr  %[tif_fork],%P[ti_flags](%%r8)\n\t"	  \
 	     "movq %%rax,%%rdi\n\t" 					  \
@@ -103,7 +105,9 @@ do {									\
 	       [ti_flags] "i" (offsetof(struct thread_info, flags)),	  \
 	       [tif_fork] "i" (TIF_FORK),			  	  \
 	       [thread_info] "i" (offsetof(struct task_struct, stack)),   \
-	       [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent))  \
+	       [task_canary] "i" (offsetof(struct task_struct, stack_canary)),\
+	       [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)), \
+	       [pda_canary] "i" (offsetof(struct x8664_pda, stack_canary))\
 	     : "memory", "cc" __EXTRA_CLOBBER)
 #endif
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..d6a515158783 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1096,10 +1096,9 @@ struct task_struct {
 	pid_t pid;
 	pid_t tgid;
 
-#ifdef CONFIG_CC_STACKPROTECTOR
 	/* Canary value for the -fstack-protector gcc feature */
 	unsigned long stack_canary;
-#endif
+
 	/* 
 	 * pointers to (original) parent process, youngest child, younger sibling,
 	 * older sibling, respectively.  (p->father can be replaced with 
-- 
cgit 


From 9b5609fd773e6ac0b1d6d6e1bf68f32cca64e06b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:41:09 +0100
Subject: stackprotector: include files

create <linux/stackprotector.h> for core kernel files to include.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/asm-x86/stackprotector.h | 4 ++++
 include/linux/stackprotector.h   | 8 ++++++++
 init/main.c                      | 1 +
 3 files changed, 13 insertions(+)
 create mode 100644 include/asm-x86/stackprotector.h
 create mode 100644 include/linux/stackprotector.h

(limited to 'include')

diff --git a/include/asm-x86/stackprotector.h b/include/asm-x86/stackprotector.h
new file mode 100644
index 000000000000..dcac7a6bdba2
--- /dev/null
+++ b/include/asm-x86/stackprotector.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_STACKPROTECTOR_H
+#define _ASM_STACKPROTECTOR_H 1
+
+#endif
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
new file mode 100644
index 000000000000..d3e8bbe602f8
--- /dev/null
+++ b/include/linux/stackprotector.h
@@ -0,0 +1,8 @@
+#ifndef _LINUX_STACKPROTECTOR_H
+#define _LINUX_STACKPROTECTOR_H 1
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+# include <asm/stackprotector.h>
+#endif
+
+#endif
diff --git a/init/main.c b/init/main.c
index f7fb20021d48..a84322ca64a2 100644
--- a/init/main.c
+++ b/init/main.c
@@ -14,6 +14,7 @@
 #include <linux/proc_fs.h>
 #include <linux/kernel.h>
 #include <linux/syscalls.h>
+#include <linux/stackprotector.h>
 #include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/delay.h>
-- 
cgit 


From 18aa8bb12dcb10adc3d7c9d69714d53667c0ab7f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:42:02 +0100
Subject: stackprotector: add boot_init_stack_canary()

add the boot_init_stack_canary() and make the secondary idle threads
use it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process_64.c     |  6 ++----
 include/asm-x86/stackprotector.h | 20 ++++++++++++++++++++
 include/linux/stackprotector.h   |  4 ++++
 3 files changed, 26 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d4c7ac7aa430..5107cb214c7b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -147,7 +147,6 @@ void cpu_idle(void)
 {
 	current_thread_info()->status |= TS_POLLING;
 
-#ifdef CONFIG_CC_STACKPROTECTOR
 	/*
 	 * If we're the non-boot CPU, nothing set the PDA stack
 	 * canary up for us - and if we are the boot CPU we have
@@ -156,9 +155,8 @@ void cpu_idle(void)
 	 * invalid canaries already on the stack wont ever
 	 * trigger):
 	 */
-	current->stack_canary = get_random_int();
-	write_pda(stack_canary, current->stack_canary);
-#endif
+	boot_init_stack_canary();
+
 	/* endless idle loop with no priority at all */
 	while (1) {
 		tick_nohz_stop_sched_tick();
diff --git a/include/asm-x86/stackprotector.h b/include/asm-x86/stackprotector.h
index dcac7a6bdba2..0f91f7a2688c 100644
--- a/include/asm-x86/stackprotector.h
+++ b/include/asm-x86/stackprotector.h
@@ -1,4 +1,24 @@
 #ifndef _ASM_STACKPROTECTOR_H
 #define _ASM_STACKPROTECTOR_H 1
 
+/*
+ * Initialize the stackprotector canary value.
+ *
+ * NOTE: this must only be called from functions that never return,
+ * and it must always be inlined.
+ */
+static __always_inline void boot_init_stack_canary(void)
+{
+	/*
+	 * If we're the non-boot CPU, nothing set the PDA stack
+	 * canary up for us - and if we are the boot CPU we have
+	 * a 0 stack canary. This is a good place for updating
+	 * it, as we wont ever return from this function (so the
+	 * invalid canaries already on the stack wont ever
+	 * trigger):
+	 */
+	current->stack_canary = get_random_int();
+	write_pda(stack_canary, current->stack_canary);
+}
+
 #endif
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
index d3e8bbe602f8..422e71aafd0b 100644
--- a/include/linux/stackprotector.h
+++ b/include/linux/stackprotector.h
@@ -3,6 +3,10 @@
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 # include <asm/stackprotector.h>
+#else
+static inline void boot_init_stack_canary(void)
+{
+}
 #endif
 
 #endif
-- 
cgit 


From 420594296838fdc9a674470d710cda7d1487f9f4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:44:08 +0100
Subject: x86: fix the stackprotector canary of the boot CPU

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/process_64.c   | 1 +
 include/linux/stackprotector.h | 4 ++++
 init/main.c                    | 6 ++++++
 3 files changed, 11 insertions(+)

(limited to 'include')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5107cb214c7b..cce47f7fbf22 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -16,6 +16,7 @@
 
 #include <stdarg.h>
 
+#include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
 #include <linux/sched.h>
diff --git a/include/linux/stackprotector.h b/include/linux/stackprotector.h
index 422e71aafd0b..6f3e54c704c0 100644
--- a/include/linux/stackprotector.h
+++ b/include/linux/stackprotector.h
@@ -1,6 +1,10 @@
 #ifndef _LINUX_STACKPROTECTOR_H
 #define _LINUX_STACKPROTECTOR_H 1
 
+#include <linux/compiler.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 # include <asm/stackprotector.h>
 #else
diff --git a/init/main.c b/init/main.c
index a84322ca64a2..b44e4eb0f5e3 100644
--- a/init/main.c
+++ b/init/main.c
@@ -546,6 +546,12 @@ asmlinkage void __init start_kernel(void)
 	unwind_init();
 	lockdep_init();
 	debug_objects_early_init();
+
+	/*
+	 * Set up the initial canary ASAP:
+	 */
+	boot_init_stack_canary();
+
 	cgroup_init_early();
 
 	local_irq_disable();
-- 
cgit 


From 960a672bd9f1ec06e8f197cf81a50fd07ea02e7f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:56:04 +0100
Subject: x86: stackprotector: mix TSC to the boot canary

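Mix the TSC into the boot canary, in addition to the random pool. The TSC only matters for very early init, where it already has some randomness on most systems; later during bootup the random pool has true entropy too.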

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/asm-x86/stackprotector.h | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86/stackprotector.h b/include/asm-x86/stackprotector.h
index 0f91f7a2688c..3baf7ad89be1 100644
--- a/include/asm-x86/stackprotector.h
+++ b/include/asm-x86/stackprotector.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_STACKPROTECTOR_H
 #define _ASM_STACKPROTECTOR_H 1
 
+#include <asm/tsc.h>
+
 /*
  * Initialize the stackprotector canary value.
  *
@@ -9,16 +11,28 @@
  */
 static __always_inline void boot_init_stack_canary(void)
 {
+	u64 canary;
+	u64 tsc;
+
 	/*
 	 * If we're the non-boot CPU, nothing set the PDA stack
 	 * canary up for us - and if we are the boot CPU we have
 	 * a 0 stack canary. This is a good place for updating
 	 * it, as we wont ever return from this function (so the
 	 * invalid canaries already on the stack wont ever
-	 * trigger):
+	 * trigger).
+	 *
+	 * We both use the random pool and the current TSC as a source
+	 * of randomness. The TSC only matters for very early init,
+	 * there it already has some randomness on most systems. Later
+	 * on during the bootup the random pool has true entropy too.
 	 */
-	current->stack_canary = get_random_int();
-	write_pda(stack_canary, current->stack_canary);
+	get_random_bytes(&canary, sizeof(canary));
+	tsc = __native_read_tsc();
+	canary += tsc + (tsc << 32UL);
+
+	current->stack_canary = canary;
+	write_pda(stack_canary, canary);
 }
 
 #endif
-- 
cgit 


From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 22 Apr 2008 16:38:23 -0500
Subject: stackprotector: use canary at end of stack to indicate overruns at
 oops time

(Updated with a common max-stack-used checker that knows about
the canary, as suggested by Joe Perches)

Use a canary at the end of the stack to clearly indicate
at oops time whether the stack has ever overflowed.

This is a very simple implementation with a couple of
drawbacks:

1) a thread may legitimately use exactly up to the last
   word on the stack

 -- but the chances of doing this and then oopsing later seem slim

2) it's possible that the stack usage isn't dense enough
   that the canary location could get skipped over

 -- but the worst that happens is that we don't flag the overrun
 -- though this happens fairly often in my testing :(

With the code in place, an intentionally-bloated stack oops might
do:

BUG: unable to handle kernel paging request at ffff8103f84cc680
IP: [<ffffffff810253df>] update_curr+0x9a/0xa8
PGD 8063 PUD 0
Thread overran stack, or stack corrupted
Oops: 0000 [1] SMP
CPU 0
...

... unless the stack overrun is so bad that it corrupts some other
thread.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/fault.c   |  7 +++++++
 include/linux/magic.h |  1 +
 include/linux/sched.h | 13 +++++++++++++
 kernel/exit.c         |  5 +----
 kernel/fork.c         |  5 +++++
 kernel/sched.c        |  7 +------
 6 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75a..1f524df68b96 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -25,6 +25,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long address;
 	int write, si_code;
 	int fault;
+	unsigned long *stackend;
+
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 #endif
@@ -850,6 +853,10 @@ no_context:
 
 	show_fault_oops(regs, error_code, address);
 
+ 	stackend = end_of_stack(tsk);
+	if (*stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	tsk->thread.cr2 = address;
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 1fa0c2ce4dec..74e68e201166 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -42,4 +42,5 @@
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
 #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
 
+#define STACK_END_MAGIC		0x57AC6E9D
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6a515158783..c5181e77f305 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1969,6 +1969,19 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 
 extern void thread_info_cache_init(void);
 
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+	unsigned long *n = end_of_stack(p);
+
+	do { 	/* Skip over canary */
+		n++;
+	} while (!*n);
+
+	return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif
+
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f6185e69b69..fb8de6cbf2c7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -899,12 +899,9 @@ static void check_stack_usage(void)
 {
 	static DEFINE_SPINLOCK(low_water_lock);
 	static int lowest_to_date = THREAD_SIZE;
-	unsigned long *n = end_of_stack(current);
 	unsigned long free;
 
-	while (*n == 0)
-		n++;
-	free = (unsigned long)n - (unsigned long)end_of_stack(current);
+	free = stack_not_used(current);
 
 	if (free >= lowest_to_date)
 		return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..d428336e7aa1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -54,6 +54,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/magic.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
+	unsigned long *stackend;
+
 	int err;
 
 	prepare_to_copy(orig);
@@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto out;
 
 	setup_thread_stack(tsk, orig);
+	stackend = end_of_stack(tsk);
+	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	tsk->stack_canary = get_random_int();
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..a964ed945094 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p)
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-	{
-		unsigned long *n = end_of_stack(p);
-		while (!*n)
-			n++;
-		free = (unsigned long)n - (unsigned long)end_of_stack(p);
-	}
+	free = stack_not_used(p);
 #endif
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
-- 
cgit 


From af9ff7868f0f76d3364351b1641b9dfa99588e77 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Jul 2008 09:36:38 -0700
Subject: x86: simplify stackprotector self-check

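Clean up the stackprotector self-check by removing code that is no longer
needed, and make sure the PDA canary is updated and kept in sync with
current->stack_canary whenever the self-check changes it.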

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/pda.h |  1 +
 kernel/panic.c        | 29 +++++++----------------------
 2 files changed, 8 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 62b734986a44..a5ff5bb76299 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -131,4 +131,5 @@ do {									\
 
 #define PDA_STACKOFFSET (5*8)
 
+#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
 #endif
diff --git a/kernel/panic.c b/kernel/panic.c
index 28153aec7100..87445a894c3a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -328,37 +328,21 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #ifndef GCC_HAS_SP
 #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 #endif
+
 static unsigned long __stack_check_testing;
+
 /*
  * Self test function for the stack-protector feature.
  * This test requires that the local variable absolutely has
- * a stack slot, hence the barrier()s.
+ * a stack slot.
  */
 static noinline void __stack_chk_test_func(void)
 {
-	unsigned long foo;
-	barrier();
-	/*
-	 * we need to make sure we're not about to clobber the return address,
-	 * while real exploits do this, it's unhealthy on a running system.
-	 * Besides, if we would, the test is already failed anyway so
-	 * time to pull the emergency brake on it.
-	 */
-	if ((unsigned long)__builtin_return_address(0) ==
-					*(((unsigned long *)&foo)+1)) {
-		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-	}
-#ifdef CONFIG_FRAME_POINTER
-	/* We also don't want to clobber the frame pointer */
-	if ((unsigned long)__builtin_return_address(0) ==
-					*(((unsigned long *)&foo)+2)) {
-		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-	}
-#endif
-	if (current->stack_canary != *(((unsigned long *)&foo)+1))
-		printk(KERN_ERR "No -fstack-protector canary found\n");
+	unsigned long dummy_buffer[64]; /* force gcc to use the canary */
 
 	current->stack_canary = ~current->stack_canary;
+	refresh_stack_canary();
+	dummy_buffer[3] = 1; /* fool gcc into keeping the variable */
 }
 
 static int __stack_chk_test(void)
@@ -371,6 +355,7 @@ static int __stack_chk_test(void)
 		WARN_ON(1);
 	};
 	current->stack_canary = ~current->stack_canary;
+	refresh_stack_canary();
 	return 0;
 }
 /*
-- 
cgit 


From 7f7ace0cda64c99599c23785f8979a072e118058 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:08 -0800
Subject: cpumask: update irq_desc to use cpumask_var_t

Impact: reduce memory usage, use new cpumask API.

Replace the affinity and pending_masks with cpumask_var_t's.  This adds
to the significant size reduction done with the SPARSE_IRQS changes.

The added functions (init_alloc_desc_masks & init_copy_desc_masks) are
in the include file so they can be inlined (and optimized out for the
!CONFIG_CPUMASKS_OFFSTACK case.)  [Naming chosen to be consistent with
the other init*irq functions, as well as the backwards arg declaration
of "from, to" instead of the more common "to, from" standard.]

Includes a slight change to the declaration of struct irq_desc to embed
the pending_mask within ifdef(CONFIG_SMP) to be consistent with other
references, and some small changes to Xen.

Tested: sparse/non-sparse/cpumask_offstack/non-cpumask_offstack/nonuma/nosmp on x86_64

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: virtualization@lists.osdl.org
Cc: xen-devel@lists.xensource.com
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
---
 arch/x86/kernel/io_apic.c | 20 ++++++------
 arch/x86/kernel/irq_32.c  |  2 +-
 arch/x86/kernel/irq_64.c  |  2 +-
 drivers/xen/events.c      |  4 +--
 include/linux/irq.h       | 81 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/irq/chip.c         |  5 ++-
 kernel/irq/handle.c       | 26 ++++++++-------
 kernel/irq/manage.c       | 12 +++----
 kernel/irq/migration.c    | 12 +++----
 kernel/irq/numa_migrate.c | 12 ++++++-
 kernel/irq/proc.c         |  4 +--
 11 files changed, 135 insertions(+), 45 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1c4a1302536c..1337eab60ecc 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -356,7 +356,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
 
 	if (!cfg->move_in_progress) {
 		/* it means that domain is not changed */
-		if (!cpumask_intersects(&desc->affinity, mask))
+		if (!cpumask_intersects(desc->affinity, mask))
 			cfg->move_desc_pending = 1;
 	}
 }
@@ -579,9 +579,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	cpumask_and(&desc->affinity, cfg->domain, mask);
+	cpumask_and(desc->affinity, cfg->domain, mask);
 	set_extra_move_desc(desc, mask);
-	return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+	return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
 }
 
 static void
@@ -2383,7 +2383,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	if (cfg->move_in_progress)
 		send_cleanup_vector(cfg);
 
-	cpumask_copy(&desc->affinity, mask);
+	cpumask_copy(desc->affinity, mask);
 }
 
 static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2405,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq_desc(desc, &desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
-	cpumask_clear(&desc->pending_mask);
+	cpumask_clear(desc->pending_mask);
 
 unmask:
 	unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2434,7 @@ static void ir_irq_migration(struct work_struct *work)
 				continue;
 			}
 
-			desc->chip->set_affinity(irq, &desc->pending_mask);
+			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
 	}
@@ -2448,7 +2448,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 {
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, mask);
+		cpumask_copy(desc->pending_mask, mask);
 		migrate_irq_remapped_level_desc(desc);
 		return;
 	}
@@ -2516,7 +2516,7 @@ static void irq_complete_move(struct irq_desc **descp)
 
 		/* domain has not changed, but affinity did */
 		me = smp_processor_id();
-		if (cpu_isset(me, desc->affinity)) {
+		if (cpumask_test_cpu(me, desc->affinity)) {
 			*descp = desc = move_irq_desc(desc, me);
 			/* get the new one */
 			cfg = desc->chip_data;
@@ -4039,7 +4039,7 @@ void __init setup_ioapic_dest(void)
 			 */
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-				mask = &desc->affinity;
+				mask = desc->affinity;
 			else
 				mask = TARGET_CPUS;
 
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
 		if (irq == 2)
 			continue;
 
-		affinity = &desc->affinity;
+		affinity = desc->affinity;
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			printk("Breaking affinity for irq %i\n", irq);
 			affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..0b21cb1ea11f 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -100,7 +100,7 @@ void fixup_irqs(void)
 		/* interrupt's are disabled at this point */
 		spin_lock(&desc->lock);
 
-		affinity = &desc->affinity;
+		affinity = desc->affinity;
 		if (!irq_has_action(irq) ||
 		    cpumask_equal(affinity, cpu_online_mask)) {
 			spin_unlock(&desc->lock);
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eb0dfdeaa949..e0767ff35d6c 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
+	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
 #endif
 
 	__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
@@ -142,7 +142,7 @@ static void init_evtchn_cpu_bindings(void)
 
 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
-		desc->affinity = cpumask_of_cpu(0);
+		cpumask_copy(desc->affinity, cpumask_of(0));
 	}
 #endif
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..fa27210f1dfd 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -182,11 +182,11 @@ struct irq_desc {
 	unsigned int		irqs_unhandled;
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
-	cpumask_t		affinity;
+	cpumask_var_t		affinity;
 	unsigned int		cpu;
-#endif
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	cpumask_t		pending_mask;
+	cpumask_var_t		pending_mask;
+#endif
 #endif
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*dir;
@@ -422,4 +422,79 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #endif /* !CONFIG_S390 */
 
+#ifdef CONFIG_SMP
+/**
+ * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * @desc:	pointer to irq_desc struct
+ * @boot:	true if need bootmem
+ *
+ * Allocates affinity and pending_mask cpumask if required.
+ * Returns true if successful (or not required).
+ * Side effect: affinity has all bits set, pending_mask has all bits clear.
+ */
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+								bool boot)
+{
+	if (boot) {
+		alloc_bootmem_cpumask_var(&desc->affinity);
+		cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+		alloc_bootmem_cpumask_var(&desc->pending_mask);
+		cpumask_clear(desc->pending_mask);
+#endif
+		return true;
+	}
+
+	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+		return false;
+	cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+		free_cpumask_var(desc->affinity);
+		return false;
+	}
+	cpumask_clear(desc->pending_mask);
+#endif
+	return true;
+}
+
+/**
+ * init_copy_desc_masks - copy cpumasks for irq_desc
+ * @old_desc:	pointer to old irq_desc struct
+ * @new_desc:	pointer to new irq_desc struct
+ *
+ * Ensures affinity and pending_masks are copied to new irq_desc.
+ * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
+ * irq_desc struct so the copy is redundant.
+ */
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+					struct irq_desc *new_desc)
+{
+#ifdef CONFIG_CPUMASKS_OFFSTACK
+	cpumask_copy(new_desc->affinity, old_desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
+#endif
+#endif
+}
+
+#else /* !CONFIG_SMP */
+
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+								bool boot)
+{
+	return true;
+}
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+					struct irq_desc *new_desc)
+{
+}
+
+#endif	/* CONFIG_SMP */
+
 #endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..c248eba98b43 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-	cpumask_setall(&desc->affinity);
+	cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_clear(desc->pending_mask);
+#endif
 #endif
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..b8fa1354f01c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -64,9 +64,6 @@ static struct irq_desc irq_desc_init = {
 	.handle_irq = handle_bad_irq,
 	.depth      = 1,
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-	.affinity   = CPU_MASK_ALL
-#endif
 };
 
 void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -88,6 +85,8 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
+	int node = cpu_to_node(cpu);
+
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
@@ -101,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+		BUG_ON(1);
+	}
 	arch_init_chip_data(desc, cpu);
 }
 
@@ -119,9 +122,6 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 		.handle_irq = handle_bad_irq,
 		.depth	    = 1,
 		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-		.affinity   = CPU_MASK_ALL
-#endif
 	}
 };
 
@@ -141,7 +141,7 @@ int __init early_irq_init(void)
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy[i];
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+		init_alloc_desc_masks(&desc[i], 0, true);
 		irq_desc_ptrs[i] = desc + i;
 	}
 
@@ -188,6 +188,10 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+		BUG_ON(1);
+	}
 	init_one_irq_desc(irq, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
@@ -207,9 +211,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
 		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
-#ifdef CONFIG_SMP
-		.affinity = CPU_MASK_ALL
-#endif
 	}
 };
 
@@ -222,9 +223,10 @@ int __init early_irq_init(void)
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		desc[i].irq = i;
-
+		init_alloc_desc_masks(&desc[i], 0, true);
+	}
 	return arch_early_irq_init();
 }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..b98739af4558 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -98,14 +98,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(&desc->affinity, cpumask);
+		cpumask_copy(desc->affinity, cpumask);
 		desc->chip->set_affinity(irq, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, cpumask);
+		cpumask_copy(desc->pending_mask, cpumask);
 	}
 #else
-	cpumask_copy(&desc->affinity, cpumask);
+	cpumask_copy(desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
@@ -127,16 +127,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 	 * one of the targets is online.
 	 */
 	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-		if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+		if (cpumask_any_and(desc->affinity, cpu_online_mask)
 		    < nr_cpu_ids)
 			goto set_affinity;
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+	cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
-	desc->chip->set_affinity(irq, &desc->affinity);
+	desc->chip->set_affinity(irq, desc->affinity);
 
 	return 0;
 }
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
 
 	desc->status &= ~IRQ_MOVE_PENDING;
 
-	if (unlikely(cpumask_empty(&desc->pending_mask)))
+	if (unlikely(cpumask_empty(desc->pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
 		   < nr_cpu_ids)) {
-		cpumask_and(&desc->affinity,
-			    &desc->pending_mask, cpu_online_mask);
-		desc->chip->set_affinity(irq, &desc->affinity);
+		cpumask_and(desc->affinity,
+			    desc->pending_mask, cpu_online_mask);
+		desc->chip->set_affinity(irq, desc->affinity);
 	}
-	cpumask_clear(&desc->pending_mask);
+	cpumask_clear(desc->pending_mask);
 }
 
 void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..f001a4ea6414 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -46,6 +46,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	init_copy_desc_masks(old_desc, desc);
 	arch_init_copy_chip_data(old_desc, desc, cpu);
 }
 
@@ -76,11 +77,20 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 	if (!desc) {
-		printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+		printk(KERN_ERR "irq %d: can not get new irq_desc "
+				"for migration.\n", irq);
 		/* still use old one */
 		desc = old_desc;
 		goto out_unlock;
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+				"for migration.\n", irq);
+		/* still use old one */
+		kfree(desc);
+		desc = old_desc;
+		goto out_unlock;
+	}
 	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	const struct cpumask *mask = &desc->affinity;
+	const struct cpumask *mask = desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
-		mask = &desc->pending_mask;
+		mask = desc->pending_mask;
 #endif
 	seq_cpumask(m, mask);
 	seq_putc(m, '\n');
-- 
cgit 


From fbd59a8d1f7cf325fdb6828659f1fb76631e87b3 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Sat, 10 Jan 2009 21:58:08 -0800
Subject: cpumask: Use topology_core_cpumask()/topology_thread_cpumask()

Impact: reduce stack usage, use new cpumask API.

This actually uses topology_core_cpumask() and
topology_thread_cpumask(), removing the only users of
topology_core_siblings() and topology_thread_siblings().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
Cc: linux-net-drivers@solarflare.com
---
 Documentation/cputopology.txt |  6 +++---
 drivers/base/topology.c       | 33 ++++++++++++++++-----------------
 drivers/net/sfc/efx.c         |  4 ++--
 include/linux/topology.h      |  6 ++++++
 4 files changed, 27 insertions(+), 22 deletions(-)

(limited to 'include')

diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt
index 45932ec21cee..b41f3e58aefa 100644
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -18,11 +18,11 @@ For an architecture to support this feature, it must define some of
 these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
-#define topology_thread_siblings(cpu)
-#define topology_core_siblings(cpu)
+#define topology_thread_cpumask(cpu)
+#define topology_core_cpumask(cpu)
 
 The type of **_id is int.
-The type of siblings is cpumask_t.
+The type of siblings is (const) struct cpumask *.
 
 To be consistent on all architectures, include/linux/topology.h
 provides default definitions for any of the above macros that are
diff --git a/drivers/base/topology.c b/drivers/base/topology.c
index a778fb52b11f..bf6b13206d00 100644
--- a/drivers/base/topology.c
+++ b/drivers/base/topology.c
@@ -31,7 +31,10 @@
 #include <linux/hardirq.h>
 #include <linux/topology.h>
 
-#define define_one_ro(_name) 		\
+#define define_one_ro_named(_name, _func)				\
+static SYSDEV_ATTR(_name, 0444, _func, NULL)
+
+#define define_one_ro(_name)				\
 static SYSDEV_ATTR(_name, 0444, show_##_name, NULL)
 
 #define define_id_show_func(name)				\
@@ -42,8 +45,8 @@ static ssize_t show_##name(struct sys_device *dev,		\
 	return sprintf(buf, "%d\n", topology_##name(cpu));	\
 }
 
-#if defined(topology_thread_siblings) || defined(topology_core_siblings)
-static ssize_t show_cpumap(int type, cpumask_t *mask, char *buf)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
 {
 	ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
 	int n = 0;
@@ -65,7 +68,7 @@ static ssize_t show_##name(struct sys_device *dev,			\
 			   struct sysdev_attribute *attr, char *buf)	\
 {									\
 	unsigned int cpu = dev->id;					\
-	return show_cpumap(0, &(topology_##name(cpu)), buf);		\
+	return show_cpumap(0, topology_##name(cpu), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
@@ -74,7 +77,7 @@ static ssize_t show_##name##_list(struct sys_device *dev,		\
 				  char *buf)				\
 {									\
 	unsigned int cpu = dev->id;					\
-	return show_cpumap(1, &(topology_##name(cpu)), buf);		\
+	return show_cpumap(1, topology_##name(cpu), buf);		\
 }
 
 #else
@@ -82,9 +85,7 @@ static ssize_t show_##name##_list(struct sys_device *dev,		\
 static ssize_t show_##name(struct sys_device *dev,			\
 			   struct sysdev_attribute *attr, char *buf)	\
 {									\
-	unsigned int cpu = dev->id;					\
-	cpumask_t mask = topology_##name(cpu);				\
-	return show_cpumap(0, &mask, buf);				\
+	return show_cpumap(0, topology_##name(dev->id), buf);		\
 }
 
 #define define_siblings_show_list(name)					\
@@ -92,9 +93,7 @@ static ssize_t show_##name##_list(struct sys_device *dev,		\
 				  struct sysdev_attribute *attr,	\
 				  char *buf)				\
 {									\
-	unsigned int cpu = dev->id;					\
-	cpumask_t mask = topology_##name(cpu);				\
-	return show_cpumap(1, &mask, buf);				\
+	return show_cpumap(1, topology_##name(dev->id), buf);		\
 }
 #endif
 
@@ -107,13 +106,13 @@ define_one_ro(physical_package_id);
 define_id_show_func(core_id);
 define_one_ro(core_id);
 
-define_siblings_show_func(thread_siblings);
-define_one_ro(thread_siblings);
-define_one_ro(thread_siblings_list);
+define_siblings_show_func(thread_cpumask);
+define_one_ro_named(thread_siblings, show_thread_cpumask);
+define_one_ro_named(thread_siblings_list, show_thread_cpumask_list);
 
-define_siblings_show_func(core_siblings);
-define_one_ro(core_siblings);
-define_one_ro(core_siblings_list);
+define_siblings_show_func(core_cpumask);
+define_one_ro_named(core_siblings, show_core_cpumask);
+define_one_ro_named(core_siblings_list, show_core_cpumask_list);
 
 static struct attribute *default_attrs[] = {
 	&attr_physical_package_id.attr,
diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c
index 7673fd92eaf5..f2e56ceee0ea 100644
--- a/drivers/net/sfc/efx.c
+++ b/drivers/net/sfc/efx.c
@@ -863,8 +863,8 @@ static int efx_wanted_rx_queues(void)
 	for_each_online_cpu(cpu) {
 		if (!cpu_isset(cpu, core_mask)) {
 			++count;
-			cpus_or(core_mask, core_mask,
-				topology_core_siblings(cpu));
+			cpumask_or(&core_mask, &core_mask,
+				   topology_core_cpumask(cpu));
 		}
 	}
 
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e632d29f0544..a16b9e06f2e5 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -193,5 +193,11 @@ int arch_update_cpu_topology(void);
 #ifndef topology_core_siblings
 #define topology_core_siblings(cpu)		cpumask_of_cpu(cpu)
 #endif
+#ifndef topology_thread_cpumask
+#define topology_thread_cpumask(cpu)		cpumask_of(cpu)
+#endif
+#ifndef topology_core_cpumask
+#define topology_core_cpumask(cpu)		cpumask_of(cpu)
+#endif
 
 #endif /* _LINUX_TOPOLOGY_H */
-- 
cgit 


From 802bf931f2688ad125b73db597ce63cc842fb27a Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:09 -0800
Subject: cpumask: fix bug in use cpumask_var_t in irq_desc

Impact: fix bug where new irq_desc uses old cpumask pointers which are freed.

As Yinghai pointed out, init_copy_one_irq_desc() copies the old desc to
the new desc, overwriting the cpumask pointers.  Since the old_desc and
the cpumask pointers are freed, memory corruption will occur if
these old pointers are used.

Move the allocation of these pointers to after the copy.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/irq.h       |  9 +++++++--
 kernel/irq/handle.c       |  8 +-------
 kernel/irq/numa_migrate.c | 13 ++++++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index fa27210f1dfd..27a67536511e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -426,15 +426,18 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 /**
  * init_alloc_desc_masks - allocate cpumasks for irq_desc
  * @desc:	pointer to irq_desc struct
+ * @cpu:	cpu which will be handling the cpumasks
  * @boot:	true if need bootmem
  *
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
  * Side effect: affinity has all bits set, pending_mask has all bits clear.
  */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
+	int node;
+
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
 		cpumask_setall(desc->affinity);
@@ -446,6 +449,8 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
 		return true;
 	}
 
+	node = cpu_to_node(cpu);
+
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
 	cpumask_setall(desc->affinity);
@@ -484,7 +489,7 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
 	return true;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b8fa1354f01c..f01c0a30cb42 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -85,8 +85,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
-	int node = cpu_to_node(cpu);
-
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
@@ -100,7 +98,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
+	if (!init_alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
@@ -188,10 +186,6 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
-		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
-		BUG_ON(1);
-	}
 	init_one_irq_desc(irq, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index f001a4ea6414..666260e4c065 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,16 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 	old_desc->kstat_irqs = NULL;
 }
 
-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 		 struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	if (!init_alloc_desc_masks(desc, cpu, false)) {
+		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+				"for migration.\n", irq);
+		return false;
+	}
 	spin_lock_init(&desc->lock);
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
 	init_copy_desc_masks(old_desc, desc);
 	arch_init_copy_chip_data(old_desc, desc, cpu);
+	return true;
 }
 
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -83,15 +89,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 		desc = old_desc;
 		goto out_unlock;
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
-		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
-				"for migration.\n", irq);
+	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
 		/* still use old one */
 		kfree(desc);
 		desc = old_desc;
 		goto out_unlock;
 	}
-	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
 
-- 
cgit 


From 9332fccdedf8e09448f3b69b624211ae879f6c45 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:07 -0800
Subject: irq: initialize nr_irqs based on nr_cpu_ids

Impact: Reduce memory usage.

This is the second half of the changes to make the irq_desc_ptrs be
variable sized based on nr_cpu_ids.  This is done by adding a new
"max_nr_irqs" macro to irq_vectors.h (and a dummy in irqnr.h) to
return a max NR_IRQS value based on NR_CPUS or nr_cpu_ids.

This necessitated moving the define of MAX_IO_APICS to a separate
file (asm/apicnum.h) so it could be included without the baggage
of the other asm/apicdef.h declarations.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/include/asm/apicdef.h     |  8 ++------
 arch/x86/include/asm/apicnum.h     | 12 ++++++++++++
 arch/x86/include/asm/irq_vectors.h | 16 +++++++++++-----
 include/linux/irqnr.h              |  7 +++++++
 kernel/irq/handle.c                |  3 +++
 5 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 arch/x86/include/asm/apicnum.h

(limited to 'include')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..1a6454ef7f6c 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -132,12 +132,8 @@
 #define APIC_BASE_MSR	0x800
 #define X2APIC_ENABLE	(1UL << 10)
 
-#ifdef CONFIG_X86_32
-# define MAX_IO_APICS 64
-#else
-# define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
-#endif
+/* get MAX_IO_APICS */
+#include <asm/apicnum.h>
 
 /*
  * All x86-64 systems are xAPIC compatible.
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 000000000000..82f613c607ce
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_APICNUM_H
+#define _ASM_X86_APICNUM_H
+
+/* define MAX_IO_APICS */
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..602361ad0e74 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -105,6 +105,8 @@
 
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 
+#include <asm/apicnum.h>	/* need MAX_IO_APICS */
+
 #ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
@@ -112,11 +114,15 @@
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
 #else
-# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
-#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
-# else
-#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
-# endif
+
+/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */
+# define max_nr_irqs(nr_cpus)				\
+	((8 * nr_cpus) > (32 * MAX_IO_APICS) ?		\
+		(NR_VECTORS + (8 * NR_CPUS)) :		\
+		(NR_VECTORS + (32 * MAX_IO_APICS)))	\
+
+# define NR_IRQS max_nr_irqs(NR_CPUS)
+
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 86af92e9e84c..de66e4e10406 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -20,11 +20,18 @@
 
 # define for_each_irq_desc_reverse(irq, desc)                          \
 	for (irq = nr_irqs - 1; irq >= 0; irq--)
+
 #else /* CONFIG_GENERIC_HARDIRQS */
 
+#include <asm/irq_vectors.h>	/* need possible max_nr_irqs() */
+
 extern int nr_irqs;
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 
+# ifndef max_nr_irqs
+#  define max_nr_irqs(nr_cpus)	NR_IRQS
+# endif
+
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
 	     irq++, desc = irq_to_desc(irq))				\
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d0b8f7e72790..ebba7a116f14 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,6 +133,9 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
+	/* initialize nr_irqs based on nr_cpu_ids */
+	nr_irqs = max_nr_irqs(nr_cpu_ids);
+
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
 
 	desc = irq_desc_legacy;
-- 
cgit 


From 92296c6d6e908c35fca287a21af27be814af9c75 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sun, 11 Jan 2009 09:22:58 -0800
Subject: cpumask, irq: non-x86 build failures

Ingo Molnar wrote:

> All non-x86 architectures fail to build:
>
> In file included from /home/mingo/tip/include/linux/random.h:11,
>                  from /home/mingo/tip/include/linux/stackprotector.h:6,
>                  from /home/mingo/tip/init/main.c:17:
> /home/mingo/tip/include/linux/irqnr.h:26:63: error: asm/irq_vectors.h: No such file or directory

Do not include asm/irq_vectors.h in generic code - it's not available
on all architectures.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apicdef.h | 8 ++++++--
 include/linux/irqnr.h          | 6 ------
 kernel/irq/handle.c            | 5 +++++
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 1a6454ef7f6c..63134e31e8b9 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -132,8 +132,12 @@
 #define APIC_BASE_MSR	0x800
 #define X2APIC_ENABLE	(1UL << 10)
 
-/* get MAX_IO_APICS */
-#include <asm/apicnum.h>
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
 
 /*
  * All x86-64 systems are xAPIC compatible.
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index de66e4e10406..887477bc2ab0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -23,15 +23,9 @@
 
 #else /* CONFIG_GENERIC_HARDIRQS */
 
-#include <asm/irq_vectors.h>	/* need possible max_nr_irqs() */
-
 extern int nr_irqs;
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 
-# ifndef max_nr_irqs
-#  define max_nr_irqs(nr_cpus)	NR_IRQS
-# endif
-
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
 	     irq++, desc = irq_to_desc(irq))				\
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b39f32ac8f80..04d3e46031e5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -58,6 +58,11 @@ int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
+
+#ifndef max_nr_irqs
+#define max_nr_irqs(nr_cpus)	NR_IRQS
+#endif
+
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
 	.status	    = IRQ_DISABLED,
-- 
cgit 


From 4a046d1754ee6ebb6f399696805ed61ea0444d4c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 12 Jan 2009 17:39:24 -0800
Subject: x86: arch_probe_nr_irqs

Impact: save RAM with large NR_CPUS, get smaller nr_irqs

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/include/asm/irq_vectors.h |  7 ++-----
 arch/x86/kernel/io_apic.c          | 16 ++++++++++++++++
 include/linux/interrupt.h          |  1 +
 kernel/irq/handle.c                |  9 ++-------
 kernel/softirq.c                   |  5 +++++
 5 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 602361ad0e74..a16a2ab2b429 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -115,14 +115,11 @@
 # endif
 #else
 
-/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */
-# define max_nr_irqs(nr_cpus)				\
-	((8 * nr_cpus) > (32 * MAX_IO_APICS) ?		\
+# define NR_IRQS					\
+	((8 * NR_CPUS) > (32 * MAX_IO_APICS) ?		\
 		(NR_VECTORS + (8 * NR_CPUS)) :		\
 		(NR_VECTORS + (32 * MAX_IO_APICS)))	\
 
-# define NR_IRQS max_nr_irqs(NR_CPUS)
-
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index ae80638012de..157986916cd1 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3850,6 +3850,22 @@ void __init probe_nr_irqs_gsi(void)
 		nr_irqs_gsi = nr;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+	int nr;
+
+	nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
+		(NR_VECTORS + (8 * nr_cpu_ids)) :
+		(NR_VECTORS + (32 * nr_ioapics)));
+
+	if (nr < nr_irqs && nr > nr_irqs_gsi)
+		nr_irqs = nr;
+
+	return 0;
+}
+#endif
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..472f11765f60 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v);
 struct irq_desc;
 
 extern int early_irq_init(void);
+extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
 extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 04d3e46031e5..375d68cd5bf0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -59,10 +59,6 @@ EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
 
-#ifndef max_nr_irqs
-#define max_nr_irqs(nr_cpus)	NR_IRQS
-#endif
-
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
 	.status	    = IRQ_DISABLED,
@@ -137,9 +133,8 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
-	/* initialize nr_irqs based on nr_cpu_ids */
-	nr_irqs = max_nr_irqs(nr_cpu_ids);
-
+	 /* initialize nr_irqs based on nr_cpu_ids */
+	arch_probe_nr_irqs();
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
 
 	desc = irq_desc_legacy;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..0365b4899a3d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -795,6 +795,11 @@ int __init __weak early_irq_init(void)
 	return 0;
 }
 
+int __init __weak arch_probe_nr_irqs(void)
+{
+	return 0;
+}
+
 int __init __weak arch_early_irq_init(void)
 {
 	return 0;
-- 
cgit 


From 3e5d8f978435bb9ba4dfe3f4514e65e7885db1a9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Jan 2009 20:41:35 +0900
Subject: x86: make percpu symbols zerobased on SMP

[ Based on original patch from Christoph Lameter and Mike Travis. ]

This patch makes percpu symbols zerobased on x86_64 SMP by adding
PERCPU_VADDR() to vmlinux.lds.h which helps setting explicit vaddr on
the percpu output section and using it in vmlinux_64.lds.S.  A new
PHDR is added as existing ones cannot contain sections near address
zero.  PERCPU_VADDR() also adds a new symbol __per_cpu_load which
always points to the vaddr of the loaded percpu data.init region.

The following adjustments have been made to accommodate the address
change.

* code to locate percpu gdt_page in head_64.S is updated to add the
  load address to the gdt_page offset.

* __per_cpu_load is used in places where access to the init data area
  is necessary.

* pda->data_offset is initialized soon after C code is entered as zero
  value doesn't work anymore.

This patch is mostly taken from Mike Travis' "x86_64: Base percpu
variables at zero" patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head64.c          |  2 ++
 arch/x86/kernel/head_64.S         | 24 +++++++++++++++++-
 arch/x86/kernel/setup_percpu.c    |  2 +-
 arch/x86/kernel/vmlinux_64.lds.S  | 17 ++++++++++++-
 include/asm-generic/sections.h    |  2 +-
 include/asm-generic/vmlinux.lds.h | 51 ++++++++++++++++++++++++++++++++++-----
 6 files changed, 88 insertions(+), 10 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index b9a4d8c4b935..bc2900ca82c7 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -44,6 +44,8 @@ void __init x86_64_init_pda(void)
 {
 	_cpu_pda = __cpu_pda;
 	cpu_pda(0) = &_boot_cpu_pda;
+	cpu_pda(0)->data_offset =
+		(unsigned long)(__per_cpu_load - __per_cpu_start);
 	pda_init(0);
 }
 
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 0e275d495563..7ee0363871e8 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -204,6 +204,23 @@ ENTRY(secondary_startup_64)
 	pushq $0
 	popfq
 
+#ifdef CONFIG_SMP
+	/*
+	 * early_gdt_base should point to the gdt_page in static percpu init
+	 * data area.  Computing this requires two symbols - __per_cpu_load
+	 * and per_cpu__gdt_page.  As linker can't do no such relocation, do
+	 * it by hand.  As early_gdt_descr is manipulated by C code for
+	 * secondary CPUs, this should be done only once for the boot CPU
+	 * when early_gdt_descr_base contains zero.
+	 */
+	movq	early_gdt_descr_base(%rip), %rax
+	testq	%rax, %rax
+	jnz	1f
+	movq	$__per_cpu_load, %rax
+	addq	$per_cpu__gdt_page, %rax
+	movq	%rax, early_gdt_descr_base(%rip)
+1:
+#endif
 	/*
 	 * We must switch to a new descriptor in kernel space for the GDT
 	 * because soon the kernel won't have access anymore to the userspace
@@ -401,7 +418,12 @@ NEXT_PAGE(level2_spare_pgt)
 	.globl early_gdt_descr
 early_gdt_descr:
 	.word	GDT_ENTRIES*8-1
-	.quad   per_cpu__gdt_page
+#ifdef CONFIG_SMP
+early_gdt_descr_base:
+	.quad   0x0000000000000000
+#else
+	.quad	per_cpu__gdt_page
+#endif
 
 ENTRY(phys_base)
 	/* This must match the first entry in level2_kernel_pgt */
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 56c63ac62b10..44845842e722 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -213,7 +213,7 @@ void __init setup_per_cpu_areas(void)
 		}
 #endif
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
-		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
+		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
 
 		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1a614c0e6bef..f50280db0dfe 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -19,6 +19,9 @@ PHDRS {
 	data PT_LOAD FLAGS(7);	/* RWE */
 	user PT_LOAD FLAGS(7);	/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
+#ifdef CONFIG_SMP
+	percpu PT_LOAD FLAGS(7);	/* RWE */
+#endif
 	note PT_NOTE FLAGS(0);	/* ___ */
 }
 SECTIONS
@@ -208,14 +211,26 @@ SECTIONS
   __initramfs_end = .;
 #endif
 
+#ifdef CONFIG_SMP
+  /*
+   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+   * output PHDR, so the next output section - __data_nosave - should
+   * switch it back to data.init.
+   */
+  . = ALIGN(PAGE_SIZE);
+  PERCPU_VADDR(0, :percpu)
+#else
   PERCPU(PAGE_SIZE)
+#endif
 
   . = ALIGN(PAGE_SIZE);
   __init_end = .;
 
   . = ALIGN(PAGE_SIZE);
   __nosave_begin = .;
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) }
+  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+      *(.data.nosave)
+  } :data.init	/* switch back to data.init, see PERCPU_VADDR() above */
   . = ALIGN(PAGE_SIZE);
   __nosave_end = .;
 
diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h
index 79a7ff925bf8..4ce48e878530 100644
--- a/include/asm-generic/sections.h
+++ b/include/asm-generic/sections.h
@@ -9,7 +9,7 @@ extern char __bss_start[], __bss_stop[];
 extern char __init_begin[], __init_end[];
 extern char _sinittext[], _einittext[];
 extern char _end[];
-extern char __per_cpu_start[], __per_cpu_end[];
+extern char __per_cpu_load[], __per_cpu_start[], __per_cpu_end[];
 extern char __kprobes_text_start[], __kprobes_text_end[];
 extern char __initdata_begin[], __initdata_end[];
 extern char __start_rodata[], __end_rodata[];
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1dd2f8..fc2f55f2dcd6 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -430,12 +430,51 @@
   	*(.initcall7.init)						\
   	*(.initcall7s.init)
 
-#define PERCPU(align)							\
-	. = ALIGN(align);						\
-	VMLINUX_SYMBOL(__per_cpu_start) = .;				\
-	.data.percpu  : AT(ADDR(.data.percpu) - LOAD_OFFSET) {		\
+#define PERCPU_PROLOG(vaddr)						\
+	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
+	.data.percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) {		\
+		VMLINUX_SYMBOL(__per_cpu_start) = .;
+
+#define PERCPU_EPILOG(phdr)						\
+		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
+	} phdr								\
+	. = __per_cpu_load + SIZEOF(.data.percpu);
+
+/**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to output section for percpu area.  If @vaddr
+ * is not blank, it specifies explicit base address and all percpu
+ * symbols will be offset from the given address.  If blank, @vaddr
+ * always equals @laddr + LOAD_OFFSET.
+ *
+ * @phdr defines the output PHDR to use if not blank.  Be warned that
+ * output PHDR is sticky.  If @phdr is specified, the next output
+ * section in the linker script will go there too.  @phdr should have
+ * a leading colon.
+ *
+ * This macro defines three symbols, __per_cpu_load, __per_cpu_start
+ * and __per_cpu_end.  The first one is the vaddr of loaded percpu
+ * init data.  __per_cpu_start equals @vaddr and __per_cpu_end is the
+ * end offset.
+ */
+#define PERCPU_VADDR(vaddr, phdr)					\
+	PERCPU_PROLOG(vaddr)						\
 		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
-	}								\
-	VMLINUX_SYMBOL(__per_cpu_end) = .;
+	PERCPU_EPILOG(phdr)
+
+/**
+ * PERCPU - define output section for percpu area, simple version
+ * @align: required alignment
+ *
+ * Aligns to @align and outputs the output section for percpu area.  This
+ * macro doesn't manipulate @vaddr or @phdr and __per_cpu_load and
+ * __per_cpu_start will be identical.
+ */
+#define PERCPU(align)							\
+	. = ALIGN(align);						\
+	PERCPU_VADDR( , )
-- 
cgit 


From 1a51e3a0aed18767cf2762e95456ecfeb0bca5e6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 13 Jan 2009 20:41:35 +0900
Subject: x86: fold pda into percpu area on SMP

[ Based on original patch from Christoph Lameter and Mike Travis. ]

Currently pdas and percpu areas are allocated separately.  %gs points
to local pda and percpu area can be reached using pda->data_offset.
This patch folds pda into percpu area.

Due to a strange gcc requirement, pda needs to be at the beginning of
the percpu area so that pda->stack_canary is at %gs:40.  To achieve
this, a new percpu output section macro - PERCPU_VADDR_PREALLOC() - is
added and used to reserve pda sized chunk at the start of the percpu
area.

After this change, for boot cpu, %gs first points to pda in the
data.init area and later during setup_per_cpu_areas() gets updated to
point to the actual pda.  This means that setup_per_cpu_areas() needs
to reload %gs for CPU0 while clearing pda area for other cpus as cpu0
already has modified it when control reaches setup_per_cpu_areas().

This patch also removes now unnecessary get_local_pda() and its call
sites.

A lot of this patch is taken from Mike Travis' "x86_64: Fold pda into
per cpu area" patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/percpu.h     |   8 +++
 arch/x86/include/asm/smp.h        |   2 -
 arch/x86/kernel/asm-offsets_64.c  |   1 +
 arch/x86/kernel/cpu/common.c      |   6 +--
 arch/x86/kernel/head64.c          |   8 ++-
 arch/x86/kernel/head_64.S         |  15 ++++--
 arch/x86/kernel/setup_percpu.c    | 107 ++++++++++++++++----------------------
 arch/x86/kernel/smpboot.c         |  60 +--------------------
 arch/x86/kernel/vmlinux_64.lds.S  |   6 ++-
 arch/x86/xen/smp.c                |  10 ----
 include/asm-generic/vmlinux.lds.h |  25 ++++++++-
 11 files changed, 104 insertions(+), 144 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index df644f3e53e6..0ed77cf33f76 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -1,6 +1,14 @@
 #ifndef _ASM_X86_PERCPU_H
 #define _ASM_X86_PERCPU_H
 
+#ifndef __ASSEMBLY__
+#ifdef CONFIG_X86_64
+extern void load_pda_offset(int cpu);
+#else
+static inline void load_pda_offset(int cpu) { }
+#endif
+#endif
+
 #ifdef CONFIG_X86_64
 #include <linux/compiler.h>
 
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index a8cea7b09434..127415402ea1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -19,8 +19,6 @@
 #include <asm/thread_info.h>
 #include <asm/cpumask.h>
 
-extern int __cpuinit get_local_pda(int cpu);
-
 extern int smp_num_siblings;
 extern unsigned int num_processors;
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 1d41d3f1edbc..f8d1b047ef4f 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -56,6 +56,7 @@ int main(void)
 	ENTRY(cpunumber);
 	ENTRY(irqstackptr);
 	ENTRY(data_offset);
+	DEFINE(pda_size, sizeof(struct x8664_pda));
 	BLANK();
 #undef ENTRY
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c116c599326e..7041acdf5579 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -893,10 +893,8 @@ void __cpuinit pda_init(int cpu)
 	/* Setup up data that may be needed in __get_free_pages early */
 	loadsegment(fs, 0);
 	loadsegment(gs, 0);
-	/* Memory clobbers used to order PDA accessed */
-	mb();
-	wrmsrl(MSR_GS_BASE, pda);
-	mb();
+
+	load_pda_offset(cpu);
 
 	pda->cpunumber = cpu;
 	pda->irqcount = -1;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 462d0beccb6b..1a311293f733 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,12 +26,18 @@
 #include <asm/bios_ebda.h>
 #include <asm/trampoline.h>
 
-/* boot cpu pda, referenced by head_64.S to initialize %gs for boot CPU */
+#ifndef CONFIG_SMP
+/* boot cpu pda, referenced by head_64.S to initialize %gs on UP */
 struct x8664_pda _boot_cpu_pda;
+#endif
 
 void __init x86_64_init_pda(void)
 {
+#ifdef CONFIG_SMP
+	cpu_pda(0) = (void *)__per_cpu_load;
+#else
 	cpu_pda(0) = &_boot_cpu_pda;
+#endif
 	cpu_pda(0)->data_offset =
 		(unsigned long)(__per_cpu_load - __per_cpu_start);
 	pda_init(0);
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 2f0ab0089883..7a995d0e9f78 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -245,10 +245,13 @@ ENTRY(secondary_startup_64)
 
 	/* Set up %gs.
 	 *
-	 * %gs should point to the pda.  For initial boot, make %gs point
-	 * to the _boot_cpu_pda in data section.  For a secondary CPU,
-	 * initial_gs should be set to its pda address before the CPU runs
-	 * this code.
+	 * On SMP, %gs should point to the per-cpu area.  For initial
+	 * boot, make %gs point to the init data section.  For a
+	 * secondary CPU, initial_gs should be set to its pda address
+	 * before the CPU runs this code.
+	 *
+	 * On UP, initial_gs points to _boot_cpu_pda and doesn't
+	 * change.
 	 */
 	movl	$MSR_GS_BASE,%ecx
 	movq	initial_gs(%rip),%rax
@@ -278,7 +281,11 @@ ENTRY(secondary_startup_64)
 	ENTRY(initial_code)
 	.quad	x86_64_start_kernel
 	ENTRY(initial_gs)
+#ifdef CONFIG_SMP
+	.quad	__per_cpu_load
+#else
 	.quad	_boot_cpu_pda
+#endif
 	__FINITDATA
 
 	ENTRY(stack_start)
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 73ab01b297c5..63d462802272 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -13,6 +13,7 @@
 #include <asm/mpspec.h>
 #include <asm/apicdef.h>
 #include <asm/highmem.h>
+#include <asm/proto.h>
 #include <asm/cpumask.h>
 
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
@@ -65,6 +66,36 @@ static void __init setup_node_to_cpumask_map(void);
 static inline void setup_node_to_cpumask_map(void) { }
 #endif
 
+#ifdef CONFIG_X86_64
+void __cpuinit load_pda_offset(int cpu)
+{
+	/* Memory clobbers used to order pda/percpu accesses */
+	mb();
+	wrmsrl(MSR_GS_BASE, cpu_pda(cpu));
+	mb();
+}
+
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_X86_64
+
+/* correctly size the local cpu masks */
+static void setup_cpu_local_masks(void)
+{
+	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
+	alloc_bootmem_cpumask_var(&cpu_callin_mask);
+	alloc_bootmem_cpumask_var(&cpu_callout_mask);
+	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
+}
+
+#else /* CONFIG_X86_32 */
+
+static inline void setup_cpu_local_masks(void)
+{
+}
+
+#endif /* CONFIG_X86_32 */
+
 #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 /*
  * Copy data used in early init routines from the initial arrays to the
@@ -101,63 +132,7 @@ static void __init setup_per_cpu_maps(void)
  */
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
-static inline void setup_cpu_pda_map(void) { }
-
-#elif !defined(CONFIG_SMP)
-static inline void setup_cpu_pda_map(void) { }
-
-#else /* CONFIG_SMP && CONFIG_X86_64 */
-
-/*
- * Allocate cpu_pda pointer table and array via alloc_bootmem.
- */
-static void __init setup_cpu_pda_map(void)
-{
-	char *pda;
-	unsigned long size;
-	int cpu;
-
-	size = roundup(sizeof(struct x8664_pda), cache_line_size());
-
-	/* allocate cpu_pda array and pointer table */
-	{
-		unsigned long asize = size * (nr_cpu_ids - 1);
-
-		pda = alloc_bootmem(asize);
-	}
-
-	/* initialize pointer table to static pda's */
-	for_each_possible_cpu(cpu) {
-		if (cpu == 0) {
-			/* leave boot cpu pda in place */
-			continue;
-		}
-		cpu_pda(cpu) = (struct x8664_pda *)pda;
-		cpu_pda(cpu)->in_bootmem = 1;
-		pda += size;
-	}
-}
-
-#endif /* CONFIG_SMP && CONFIG_X86_64 */
-
-#ifdef CONFIG_X86_64
-
-/* correctly size the local cpu masks */
-static void setup_cpu_local_masks(void)
-{
-	alloc_bootmem_cpumask_var(&cpu_initialized_mask);
-	alloc_bootmem_cpumask_var(&cpu_callin_mask);
-	alloc_bootmem_cpumask_var(&cpu_callout_mask);
-	alloc_bootmem_cpumask_var(&cpu_sibling_setup_mask);
-}
-
-#else /* CONFIG_X86_32 */
-
-static inline void setup_cpu_local_masks(void)
-{
-}
-
-#endif /* CONFIG_X86_32 */
+#endif
 
 /*
  * Great future plan:
@@ -171,9 +146,6 @@ void __init setup_per_cpu_areas(void)
 	int cpu;
 	unsigned long align = 1;
 
-	/* Setup cpu_pda map */
-	setup_cpu_pda_map();
-
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
 	align = max_t(unsigned long, PAGE_SIZE, align);
@@ -204,8 +176,21 @@ void __init setup_per_cpu_areas(void)
 				cpu, node, __pa(ptr));
 		}
 #endif
-		per_cpu_offset(cpu) = ptr - __per_cpu_start;
+
 		memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
+#ifdef CONFIG_X86_64
+		cpu_pda(cpu) = (void *)ptr;
+
+		/*
+		 * CPU0 modified pda in the init data area, reload pda
+		 * offset for CPU0 and clear the area for others.
+		 */
+		if (cpu == 0)
+			load_pda_offset(0);
+		else
+			memset(cpu_pda(cpu), 0, sizeof(*cpu_pda(cpu)));
+#endif
+		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 
 		DBG("PERCPU: cpu %4d %p\n", cpu, ptr);
 	}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 70d846628bbf..f2f77ca494d4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -744,52 +744,6 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
 	complete(&c_idle->done);
 }
 
-#ifdef CONFIG_X86_64
-
-/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
-static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
-{
-	if (!after_bootmem)
-		free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
-}
-
-/*
- * Allocate node local memory for the AP pda.
- *
- * Must be called after the _cpu_pda pointer table is initialized.
- */
-int __cpuinit get_local_pda(int cpu)
-{
-	struct x8664_pda *oldpda, *newpda;
-	unsigned long size = sizeof(struct x8664_pda);
-	int node = cpu_to_node(cpu);
-
-	if (cpu_pda(cpu) && !cpu_pda(cpu)->in_bootmem)
-		return 0;
-
-	oldpda = cpu_pda(cpu);
-	newpda = kmalloc_node(size, GFP_ATOMIC, node);
-	if (!newpda) {
-		printk(KERN_ERR "Could not allocate node local PDA "
-			"for CPU %d on node %d\n", cpu, node);
-
-		if (oldpda)
-			return 0;	/* have a usable pda */
-		else
-			return -1;
-	}
-
-	if (oldpda) {
-		memcpy(newpda, oldpda, size);
-		free_bootmem_pda(oldpda);
-	}
-
-	newpda->in_bootmem = 0;
-	cpu_pda(cpu) = newpda;
-	return 0;
-}
-#endif /* CONFIG_X86_64 */
-
 static int __cpuinit do_boot_cpu(int apicid, int cpu)
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
@@ -807,16 +761,6 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	};
 	INIT_WORK(&c_idle.work, do_fork_idle);
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	if (cpu > 0) {
-		boot_error = get_local_pda(cpu);
-		if (boot_error)
-			goto restore_state;
-			/* if can't get pda memory, can't start cpu */
-	}
-#endif
-
 	alternatives_smp_switch(1);
 
 	c_idle.idle = get_idle_for_cpu(cpu);
@@ -931,9 +875,7 @@ do_rest:
 				inquire_remote_apic(apicid);
 		}
 	}
-#ifdef CONFIG_X86_64
-restore_state:
-#endif
+
 	if (boot_error) {
 		/* Try to put things back the way they were before ... */
 		numa_remove_cpu(cpu); /* was set by numa_add_cpu */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index f50280db0dfe..962f21f1d4d7 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -5,6 +5,7 @@
 #define LOAD_OFFSET __START_KERNEL_map
 
 #include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
 #include <asm/page.h>
 
 #undef i386	/* in case the preprocessor is a 32bit one */
@@ -215,10 +216,11 @@ SECTIONS
   /*
    * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
    * output PHDR, so the next output section - __data_nosave - should
-   * switch it back to data.init.
+   * switch it back to data.init.  Also, pda should be at the head of
+   * percpu area.  Preallocate it.
    */
   . = ALIGN(PAGE_SIZE);
-  PERCPU_VADDR(0, :percpu)
+  PERCPU_VADDR_PREALLOC(0, :percpu, pda_size)
 #else
   PERCPU(PAGE_SIZE)
 #endif
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index c44e2069c7c7..83fa4236477d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -283,16 +283,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	struct task_struct *idle = idle_task(cpu);
 	int rc;
 
-#ifdef CONFIG_X86_64
-	/* Allocate node local memory for AP pdas */
-	WARN_ON(cpu == 0);
-	if (cpu > 0) {
-		rc = get_local_pda(cpu);
-		if (rc)
-			return rc;
-	}
-#endif
-
 #ifdef CONFIG_X86_32
 	init_gdt(cpu);
 	per_cpu(current_task, cpu) = idle;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index fc2f55f2dcd6..e53319cf29cb 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -441,9 +441,10 @@
 	. = __per_cpu_load + SIZEOF(.data.percpu);
 
 /**
- * PERCPU_VADDR - define output section for percpu area
+ * PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
  * @vaddr: explicit base address (optional)
  * @phdr: destination PHDR (optional)
+ * @prealloc: the size of prealloc area
  *
  * Macro which expands to output section for percpu area.  If @vaddr
  * is not blank, it specifies explicit base address and all percpu
@@ -455,11 +456,33 @@
  * section in the linker script will go there too.  @phdr should have
  * a leading colon.
  *
+ * If @prealloc is non-zero, the specified number of bytes will be
+ * reserved at the start of percpu area.  As the prealloc area is
+ * likely to break alignment, this macro puts areas in increasing
+ * alignment order.
+ *
  * This macro defines three symbols, __per_cpu_load, __per_cpu_start
  * and __per_cpu_end.  The first one is the vaddr of loaded percpu
  * init data.  __per_cpu_start equals @vaddr and __per_cpu_end is the
  * end offset.
  */
+#define PERCPU_VADDR_PREALLOC(vaddr, segment, prealloc)			\
+	PERCPU_PROLOG(vaddr)						\
+		. += prealloc;						\
+		*(.data.percpu)						\
+		*(.data.percpu.shared_aligned)				\
+		*(.data.percpu.page_aligned)				\
+	PERCPU_EPILOG(segment)
+
+/**
+ * PERCPU_VADDR - define output section for percpu area
+ * @vaddr: explicit base address (optional)
+ * @phdr: destination PHDR (optional)
+ *
+ * Macro which expands to output section for percpu area.  Mostly
+ * identical to PERCPU_VADDR_PREALLOC(@vaddr, @phdr, 0) other than
+ * using a slightly different layout.
+ */
 #define PERCPU_VADDR(vaddr, phdr)					\
 	PERCPU_PROLOG(vaddr)						\
 		*(.data.percpu.page_aligned)				\
-- 
cgit 


From 6dbde3530850d4d8bfc1b6bd4006d92786a2787f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 15 Jan 2009 22:15:53 +0900
Subject: percpu: add optimized generic percpu accessors

It is an optimization and a cleanup, and adds the following new
generic percpu methods:

  percpu_read()
  percpu_write()
  percpu_add()
  percpu_sub()
  percpu_and()
  percpu_or()
  percpu_xor()

and implements support for them on x86. (other architectures will fall
back to a default implementation)

The advantage is that for example to read a local percpu variable,
instead of this sequence:

 return __get_cpu_var(var);

 ffffffff8102ca2b:	48 8b 14 fd 80 09 74 	mov    -0x7e8bf680(,%rdi,8),%rdx
 ffffffff8102ca32:	81
 ffffffff8102ca33:	48 c7 c0 d8 59 00 00 	mov    $0x59d8,%rax
 ffffffff8102ca3a:	48 8b 04 10          	mov    (%rax,%rdx,1),%rax

We can get a single instruction by using the optimized variants:

 return percpu_read(var);

 ffffffff8102ca3f:	65 48 8b 05 91 8f fd 	mov    %gs:0x7efd8f91(%rip),%rax

I also cleaned up the x86-specific APIs and made the x86 code use
these new generic percpu primitives.

tj: * fixed generic percpu_sub() definition as Roel Kluin pointed out
    * added percpu_and() for completeness's sake
    * made generic percpu ops atomic against preemption

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/include/asm/current.h        |  2 +-
 arch/x86/include/asm/irq_regs_32.h    |  4 +--
 arch/x86/include/asm/mmu_context_32.h | 12 ++++----
 arch/x86/include/asm/pda.h            | 10 +++----
 arch/x86/include/asm/percpu.h         | 24 ++++++++--------
 arch/x86/include/asm/smp.h            |  2 +-
 arch/x86/kernel/process_32.c          |  2 +-
 arch/x86/kernel/tlb_32.c              | 10 +++----
 arch/x86/mach-voyager/voyager_smp.c   |  4 +--
 arch/x86/xen/enlighten.c              | 14 +++++-----
 arch/x86/xen/irq.c                    |  8 +++---
 arch/x86/xen/mmu.c                    |  2 +-
 arch/x86/xen/multicalls.h             |  2 +-
 arch/x86/xen/smp.c                    |  2 +-
 include/asm-generic/percpu.h          | 52 +++++++++++++++++++++++++++++++++++
 15 files changed, 102 insertions(+), 48 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 0930b4f8d672..0728480f5c56 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,7 +10,7 @@ struct task_struct;
 DECLARE_PER_CPU(struct task_struct *, current_task);
 static __always_inline struct task_struct *get_current(void)
 {
-	return x86_read_percpu(current_task);
+	return percpu_read(current_task);
 }
 
 #else /* X86_32 */
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
index 86afd7473457..d7ed33ee94e9 100644
--- a/arch/x86/include/asm/irq_regs_32.h
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -15,7 +15,7 @@ DECLARE_PER_CPU(struct pt_regs *, irq_regs);
 
 static inline struct pt_regs *get_irq_regs(void)
 {
-	return x86_read_percpu(irq_regs);
+	return percpu_read(irq_regs);
 }
 
 static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
@@ -23,7 +23,7 @@ static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
 	struct pt_regs *old_regs;
 
 	old_regs = get_irq_regs();
-	x86_write_percpu(irq_regs, new_regs);
+	percpu_write(irq_regs, new_regs);
 
 	return old_regs;
 }
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
index 7e98ce1d2c0e..08b53454f831 100644
--- a/arch/x86/include/asm/mmu_context_32.h
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -4,8 +4,8 @@
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
-	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK)
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_LAZY);
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK)
+		percpu_write(cpu_tlbstate.state, TLBSTATE_LAZY);
 #endif
 }
 
@@ -19,8 +19,8 @@ static inline void switch_mm(struct mm_struct *prev,
 		/* stop flush ipis for the previous mm */
 		cpu_clear(cpu, prev->cpu_vm_mask);
 #ifdef CONFIG_SMP
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-		x86_write_percpu(cpu_tlbstate.active_mm, next);
+		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		percpu_write(cpu_tlbstate.active_mm, next);
 #endif
 		cpu_set(cpu, next->cpu_vm_mask);
 
@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev,
 	}
 #ifdef CONFIG_SMP
 	else {
-		x86_write_percpu(cpu_tlbstate.state, TLBSTATE_OK);
-		BUG_ON(x86_read_percpu(cpu_tlbstate.active_mm) != next);
+		percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+		BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next);
 
 		if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
 			/* We were in lazy tlb mode and leave_mm disabled
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index e3d3a081d798..47f274fe6953 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -45,11 +45,11 @@ extern void pda_init(int);
 
 #define cpu_pda(cpu)		(&per_cpu(__pda, cpu))
 
-#define read_pda(field)		x86_read_percpu(__pda.field)
-#define write_pda(field, val)	x86_write_percpu(__pda.field, val)
-#define add_pda(field, val)	x86_add_percpu(__pda.field, val)
-#define sub_pda(field, val)	x86_sub_percpu(__pda.field, val)
-#define or_pda(field, val)	x86_or_percpu(__pda.field, val)
+#define read_pda(field)		percpu_read(__pda.field)
+#define write_pda(field, val)	percpu_write(__pda.field, val)
+#define add_pda(field, val)	percpu_add(__pda.field, val)
+#define sub_pda(field, val)	percpu_sub(__pda.field, val)
+#define or_pda(field, val)	percpu_or(__pda.field, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define test_and_clear_bit_pda(bit, field)				\
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 328b31a429d7..03aa4b00a1c3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -40,16 +40,11 @@
 
 #ifdef CONFIG_SMP
 #define __percpu_seg_str	"%%"__stringify(__percpu_seg)":"
-#define __my_cpu_offset		x86_read_percpu(this_cpu_off)
+#define __my_cpu_offset		percpu_read(this_cpu_off)
 #else
 #define __percpu_seg_str
 #endif
 
-#include <asm-generic/percpu.h>
-
-/* We can use this directly for local CPU (faster). */
-DECLARE_PER_CPU(unsigned long, this_cpu_off);
-
 /* For arch-specific code, we can use direct single-insn ops (they
  * don't give an lvalue though). */
 extern void __bad_percpu_size(void);
@@ -115,11 +110,13 @@ do {							\
 	ret__;						\
 })
 
-#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
-#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
-#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
-#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
-#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
+#define percpu_read(var)	percpu_from_op("mov", per_cpu__##var)
+#define percpu_write(var, val)	percpu_to_op("mov", per_cpu__##var, val)
+#define percpu_add(var, val)	percpu_to_op("add", per_cpu__##var, val)
+#define percpu_sub(var, val)	percpu_to_op("sub", per_cpu__##var, val)
+#define percpu_and(var, val)	percpu_to_op("and", per_cpu__##var, val)
+#define percpu_or(var, val)	percpu_to_op("or", per_cpu__##var, val)
+#define percpu_xor(var, val)	percpu_to_op("xor", per_cpu__##var, val)
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
 #define x86_test_and_clear_bit_percpu(bit, var)				\
@@ -131,6 +128,11 @@ do {							\
 	old__;								\
 })
 
+#include <asm-generic/percpu.h>
+
+/* We can use this directly for local CPU (faster). */
+DECLARE_PER_CPU(unsigned long, this_cpu_off);
+
 #ifdef CONFIG_X86_64
 extern void load_pda_offset(int cpu);
 #else
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 127415402ea1..c7bbbbe65d3f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -160,7 +160,7 @@ extern unsigned disabled_cpus __cpuinitdata;
  * from the initial startup. We map APIC_BASE very early in page_setup(),
  * so this is correct in the x86 case.
  */
-#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
+#define raw_smp_processor_id() (percpu_read(cpu_number))
 extern int safe_smp_processor_id(void);
 
 #elif defined(CONFIG_X86_64_SMP)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a546f55c77b4..77d546817d94 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -591,7 +591,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	if (prev->gs | next->gs)
 		loadsegment(gs, next->gs);
 
-	x86_write_percpu(current_task, next_p);
+	percpu_write(current_task, next_p);
 
 	return prev_p;
 }
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index ec53818f4e38..e65449d0f7d9 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -34,8 +34,8 @@ static DEFINE_SPINLOCK(tlbstate_lock);
  */
 void leave_mm(int cpu)
 {
-	BUG_ON(x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK);
-	cpu_clear(cpu, x86_read_percpu(cpu_tlbstate.active_mm)->cpu_vm_mask);
+	BUG_ON(percpu_read(cpu_tlbstate.state) == TLBSTATE_OK);
+	cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask);
 	load_cr3(swapper_pg_dir);
 }
 EXPORT_SYMBOL_GPL(leave_mm);
@@ -103,8 +103,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 		 * BUG();
 		 */
 
-	if (flush_mm == x86_read_percpu(cpu_tlbstate.active_mm)) {
-		if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_OK) {
+	if (flush_mm == percpu_read(cpu_tlbstate.active_mm)) {
+		if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
 			if (flush_va == TLB_FLUSH_ALL)
 				local_flush_tlb();
 			else
@@ -222,7 +222,7 @@ static void do_flush_tlb_all(void *info)
 	unsigned long cpu = smp_processor_id();
 
 	__flush_tlb_all();
-	if (x86_read_percpu(cpu_tlbstate.state) == TLBSTATE_LAZY)
+	if (percpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY)
 		leave_mm(cpu);
 }
 
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index 1a48368acb09..96f15b09a4c5 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -402,7 +402,7 @@ void __init find_smp_config(void)
 	     VOYAGER_SUS_IN_CONTROL_PORT);
 
 	current_thread_info()->cpu = boot_cpu_id;
-	x86_write_percpu(cpu_number, boot_cpu_id);
+	percpu_write(cpu_number, boot_cpu_id);
 }
 
 /*
@@ -1782,7 +1782,7 @@ static void __init voyager_smp_cpus_done(unsigned int max_cpus)
 void __init smp_setup_processor_id(void)
 {
 	current_thread_info()->cpu = hard_smp_processor_id();
-	x86_write_percpu(cpu_number, hard_smp_processor_id());
+	percpu_write(cpu_number, hard_smp_processor_id());
 }
 
 static void voyager_send_call_func(cpumask_t callmask)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 312414ef9365..75b94139e1f2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -695,17 +695,17 @@ static void xen_write_cr0(unsigned long cr0)
 
 static void xen_write_cr2(unsigned long cr2)
 {
-	x86_read_percpu(xen_vcpu)->arch.cr2 = cr2;
+	percpu_read(xen_vcpu)->arch.cr2 = cr2;
 }
 
 static unsigned long xen_read_cr2(void)
 {
-	return x86_read_percpu(xen_vcpu)->arch.cr2;
+	return percpu_read(xen_vcpu)->arch.cr2;
 }
 
 static unsigned long xen_read_cr2_direct(void)
 {
-	return x86_read_percpu(xen_vcpu_info.arch.cr2);
+	return percpu_read(xen_vcpu_info.arch.cr2);
 }
 
 static void xen_write_cr4(unsigned long cr4)
@@ -718,12 +718,12 @@ static void xen_write_cr4(unsigned long cr4)
 
 static unsigned long xen_read_cr3(void)
 {
-	return x86_read_percpu(xen_cr3);
+	return percpu_read(xen_cr3);
 }
 
 static void set_current_cr3(void *v)
 {
-	x86_write_percpu(xen_current_cr3, (unsigned long)v);
+	percpu_write(xen_current_cr3, (unsigned long)v);
 }
 
 static void __xen_write_cr3(bool kernel, unsigned long cr3)
@@ -748,7 +748,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3)
 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 
 	if (kernel) {
-		x86_write_percpu(xen_cr3, cr3);
+		percpu_write(xen_cr3, cr3);
 
 		/* Update xen_current_cr3 once the batch has actually
 		   been submitted. */
@@ -764,7 +764,7 @@ static void xen_write_cr3(unsigned long cr3)
 
 	/* Update while interrupts are disabled, so its atomic with
 	   respect to ipis */
-	x86_write_percpu(xen_cr3, cr3);
+	percpu_write(xen_cr3, cr3);
 
 	__xen_write_cr3(true, cr3);
 
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index bb042608c602..2e8271431e1a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -39,7 +39,7 @@ static unsigned long xen_save_fl(void)
 	struct vcpu_info *vcpu;
 	unsigned long flags;
 
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 
 	/* flag has opposite sense of mask */
 	flags = !vcpu->evtchn_upcall_mask;
@@ -62,7 +62,7 @@ static void xen_restore_fl(unsigned long flags)
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 	vcpu->evtchn_upcall_mask = flags;
 	preempt_enable_no_resched();
 
@@ -83,7 +83,7 @@ static void xen_irq_disable(void)
 	   make sure we're don't switch CPUs between getting the vcpu
 	   pointer and updating the mask. */
 	preempt_disable();
-	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+	percpu_read(xen_vcpu)->evtchn_upcall_mask = 1;
 	preempt_enable_no_resched();
 }
 
@@ -96,7 +96,7 @@ static void xen_irq_enable(void)
 	   the caller is confused and is trying to re-enable interrupts
 	   on an indeterminate processor. */
 
-	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu = percpu_read(xen_vcpu);
 	vcpu->evtchn_upcall_mask = 0;
 
 	/* Doesn't matter if we get preempted here, because any
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 503c240e26c7..7bc7852cc5c4 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1074,7 +1074,7 @@ static void drop_other_mm_ref(void *info)
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	   it has been flushed. */
-	if (x86_read_percpu(xen_current_cr3) == __pa(mm->pgd)) {
+	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
 		load_cr3(swapper_pg_dir);
 		arch_flush_lazy_cpu_mode();
 	}
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 858938241616..e786fa7f2615 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -39,7 +39,7 @@ static inline void xen_mc_issue(unsigned mode)
 		xen_mc_flush();
 
 	/* restore flags saved in xen_mc_batch */
-	local_irq_restore(x86_read_percpu(xen_mc_irq_flags));
+	local_irq_restore(percpu_read(xen_mc_irq_flags));
 }
 
 /* Set up a callback to be called when the current batch is flushed */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 83fa4236477d..3bfd6dd0b47c 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -78,7 +78,7 @@ static __cpuinit void cpu_bringup(void)
 	xen_setup_cpu_clockevents();
 
 	cpu_set(cpu, cpu_online_map);
-	x86_write_percpu(cpu_state, CPU_ONLINE);
+	percpu_write(cpu_state, CPU_ONLINE);
 	wmb();
 
 	/* We can take interrupts now: we're officially "up". */
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index b0e63c672ebd..00f45ff081a6 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -80,4 +80,56 @@ extern void setup_per_cpu_areas(void);
 #define DECLARE_PER_CPU(type, name) extern PER_CPU_ATTRIBUTES \
 					__typeof__(type) per_cpu_var(name)
 
+/*
+ * Optional methods for optimized non-lvalue per-cpu variable access.
+ *
+ * @var can be a percpu variable or a field of it and its size should
+ * equal char, int or long.  percpu_read() evaluates to an rvalue and
+ * all others to void.
+ *
+ * These operations are guaranteed to be atomic w.r.t. preemption.
+ * The generic versions use plain get/put_cpu_var().  Archs are
+ * encouraged to implement single-instruction alternatives which don't
+ * require preemption protection.
+ */
+#ifndef percpu_read
+# define percpu_read(var)						\
+  ({									\
+	typeof(per_cpu_var(var)) __tmp_var__;				\
+	__tmp_var__ = get_cpu_var(var);					\
+	put_cpu_var(var);						\
+	__tmp_var__;							\
+  })
+#endif
+
+#define __percpu_generic_to_op(var, val, op)				\
+do {									\
+	get_cpu_var(var) op val;					\
+	put_cpu_var(var);						\
+} while (0)
+
+#ifndef percpu_write
+# define percpu_write(var, val)		__percpu_generic_to_op(var, (val), =)
+#endif
+
+#ifndef percpu_add
+# define percpu_add(var, val)		__percpu_generic_to_op(var, (val), +=)
+#endif
+
+#ifndef percpu_sub
+# define percpu_sub(var, val)		__percpu_generic_to_op(var, (val), -=)
+#endif
+
+#ifndef percpu_and
+# define percpu_and(var, val)		__percpu_generic_to_op(var, (val), &=)
+#endif
+
+#ifndef percpu_or
+# define percpu_or(var, val)		__percpu_generic_to_op(var, (val), |=)
+#endif
+
+#ifndef percpu_xor
+# define percpu_xor(var, val)		__percpu_generic_to_op(var, (val), ^=)
+#endif
+
 #endif /* _ASM_GENERIC_PERCPU_H_ */
-- 
cgit 


From 145cd30bac885dffad9db9d487baad07b68a3d04 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sat, 17 Jan 2009 14:42:50 +0900
Subject: linker script: add missing VMLINUX_SYMBOL

The newly added PERCPU_*() macros define and use __per_cpu_load, but
VMLINUX_SYMBOL() was missing from its uses, causing build failures on
archs where the linker-visible symbol differs from the C symbol
(e.g. blackfin).

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e53319cf29cb..aa6b9b1b30b5 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -432,13 +432,14 @@
 
 #define PERCPU_PROLOG(vaddr)						\
 	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
-	.data.percpu vaddr : AT(__per_cpu_load - LOAD_OFFSET) {		\
+	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)		\
+				- LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;
 
 #define PERCPU_EPILOG(phdr)						\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
 	} phdr								\
-	. = __per_cpu_load + SIZEOF(.data.percpu);
+	. = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
 
 /**
  * PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
-- 
cgit 


From 0bd74fa8e29dcad98f7e8ffe01ec05fb3326abaf Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 19 Jan 2009 12:21:27 +0900
Subject: percpu: refactor percpu.h

Impact: cleanup

Refactor the DEFINE_PER_CPU_* macros and add .data.percpu.first
section.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/asm-generic/vmlinux.lds.h |  1 +
 include/linux/percpu.h            | 41 ++++++++++++++++++++++-----------------
 2 files changed, 24 insertions(+), 18 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index aa6b9b1b30b5..32bbf50d3055 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -486,6 +486,7 @@
  */
 #define PERCPU_VADDR(vaddr, phdr)					\
 	PERCPU_PROLOG(vaddr)						\
+		*(.data.percpu.first)					\
 		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 9f2a3751873a..0e24202b5a4e 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -9,34 +9,39 @@
 #include <asm/percpu.h>
 
 #ifdef CONFIG_SMP
-#define DEFINE_PER_CPU(type, name)					\
-	__attribute__((__section__(".data.percpu")))			\
-	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+#define PER_CPU_BASE_SECTION ".data.percpu"
 
 #ifdef MODULE
-#define SHARED_ALIGNED_SECTION ".data.percpu"
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
 #else
-#define SHARED_ALIGNED_SECTION ".data.percpu.shared_aligned"
+#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
 #endif
+#define PER_CPU_FIRST_SECTION ".first"
 
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)			\
-	__attribute__((__section__(SHARED_ALIGNED_SECTION)))		\
-	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name		\
-	____cacheline_aligned_in_smp
+#else
+
+#define PER_CPU_BASE_SECTION ".data"
+#define PER_CPU_SHARED_ALIGNED_SECTION ""
+#define PER_CPU_FIRST_SECTION ""
+
+#endif
 
-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)			\
-	__attribute__((__section__(".data.percpu.page_aligned")))	\
+#define DEFINE_PER_CPU_SECTION(type, name, section)			\
+	__attribute__((__section__(PER_CPU_BASE_SECTION section)))	\
 	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
-#else
+
 #define DEFINE_PER_CPU(type, name)					\
-	PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
+	DEFINE_PER_CPU_SECTION(type, name, "")
 
-#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)		      \
-	DEFINE_PER_CPU(type, name)
+#define DEFINE_PER_CPU_SHARED_ALIGNED(type, name)			\
+	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
+	____cacheline_aligned_in_smp
 
-#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)		      \
-	DEFINE_PER_CPU(type, name)
-#endif
+#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)				\
+	DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
+
+#define DEFINE_PER_CPU_FIRST(type, name)				\
+	DEFINE_PER_CPU_SECTION(type, name, PER_CPU_FIRST_SECTION)
 
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
-- 
cgit 


From 6b7c38d55587f43bcd2cbce3a98b1c0826982090 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 19 Jan 2009 12:21:28 +0900
Subject: linker script: kill PERCPU_VADDR_PREALLOC()

Impact: cleanup

With .data.percpu.first in place, PERCPU_VADDR_PREALLOC() is no longer
necessary.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/asm-generic/vmlinux.lds.h | 45 +++++++--------------------------------
 1 file changed, 8 insertions(+), 37 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 32bbf50d3055..53e21f36a802 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -430,22 +430,10 @@
   	*(.initcall7.init)						\
   	*(.initcall7s.init)
 
-#define PERCPU_PROLOG(vaddr)						\
-	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
-	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)		\
-				- LOAD_OFFSET) {			\
-		VMLINUX_SYMBOL(__per_cpu_start) = .;
-
-#define PERCPU_EPILOG(phdr)						\
-		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
-	} phdr								\
-	. = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
-
 /**
- * PERCPU_VADDR_PREALLOC - define output section for percpu area with prealloc
+ * PERCPU_VADDR - define output section for percpu area
  * @vaddr: explicit base address (optional)
  * @phdr: destination PHDR (optional)
- * @prealloc: the size of prealloc area
  *
  * Macro which expands to output section for percpu area.  If @vaddr
  * is not blank, it specifies explicit base address and all percpu
@@ -457,40 +445,23 @@
  * section in the linker script will go there too.  @phdr should have
  * a leading colon.
  *
- * If @prealloc is non-zero, the specified number of bytes will be
- * reserved at the start of percpu area.  As the prealloc area is
- * likely to break alignment, this macro puts areas in increasing
- * alignment order.
- *
  * This macro defines three symbols, __per_cpu_load, __per_cpu_start
  * and __per_cpu_end.  The first one is the vaddr of loaded percpu
  * init data.  __per_cpu_start equals @vaddr and __per_cpu_end is the
  * end offset.
  */
-#define PERCPU_VADDR_PREALLOC(vaddr, segment, prealloc)			\
-	PERCPU_PROLOG(vaddr)						\
-		. += prealloc;						\
-		*(.data.percpu)						\
-		*(.data.percpu.shared_aligned)				\
-		*(.data.percpu.page_aligned)				\
-	PERCPU_EPILOG(segment)
-
-/**
- * PERCPU_VADDR - define output section for percpu area
- * @vaddr: explicit base address (optional)
- * @phdr: destination PHDR (optional)
- *
- * Macro which expands to output section for percpu area.  Mostly
- * identical to PERCPU_VADDR_PREALLOC(@vaddr, @phdr, 0) other than
- * using slighly different layout.
- */
 #define PERCPU_VADDR(vaddr, phdr)					\
-	PERCPU_PROLOG(vaddr)						\
+	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
+	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)		\
+				- LOAD_OFFSET) {			\
+		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
 		*(.data.percpu.first)					\
 		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
-	PERCPU_EPILOG(phdr)
+		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
+	} phdr								\
+	. = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
 
 /**
  * PERCPU - define output section for percpu area, simple version
-- 
cgit 


From 5a611268b69f05262936dd177205acbce4471358 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 26 Jan 2009 08:44:05 -0500
Subject: generic, x86: fix __per_cpu_load relocation

This patch fixes this linker error:

 WARNING: Absolute relocations present
 Offset     Info     Type     Sym.Value Sym.Name
 c0a4e07d 00e78001   R_386_32 c0ab0000  __per_cpu_load

Now, __per_cpu_load is a section-relative symbol:

 c0aa4000 D __per_cpu_load
 c0aa4000 A __per_cpu_load_abs

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 53e21f36a802..f3180a85c66a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -451,17 +451,18 @@
  * end offset.
  */
 #define PERCPU_VADDR(vaddr, phdr)					\
-	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
-	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)		\
+	VMLINUX_SYMBOL(__per_cpu_load_abs) = .;				\
+	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load_abs)	\
 				- LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
+		VMLINUX_SYMBOL(__per_cpu_load) = LOADADDR(.data.percpu) + LOAD_OFFSET;\
 		*(.data.percpu.first)					\
 		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
 	} phdr								\
-	. = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
+	. = VMLINUX_SYMBOL(__per_cpu_load_abs) + SIZEOF(.data.percpu);
 
 /**
  * PERCPU - define output section for percpu area, simple version
-- 
cgit 


From dba3d36b2f0842ed7f25c33cd3a2ccdb3d0df9db Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 29 Jan 2009 17:10:12 +0100
Subject: Revert "generic, x86: fix __per_cpu_load relocation"

This reverts commit 5a611268b69f05262936dd177205acbce4471358.

It is causing occasional boot crashes, caused by certain
linker versions (GNU ld version 2.18.50.0.6-2 20080403) messing up:

 82dcc000 D __per_cpu_load
 c16e6000 A __per_cpu_load_abs

The __per_cpu_load value is out of whack. Hpa noticed the following
detail:

  * (gdb) p/x -(0xc16e6000-0x82dcc000)
  * $2 = 0xc16e6000
  * I.e. one is the other << 1

The two symbols should be equal.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index f3180a85c66a..53e21f36a802 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -451,18 +451,17 @@
  * end offset.
  */
 #define PERCPU_VADDR(vaddr, phdr)					\
-	VMLINUX_SYMBOL(__per_cpu_load_abs) = .;				\
-	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load_abs)	\
+	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
+	.data.percpu vaddr : AT(VMLINUX_SYMBOL(__per_cpu_load)		\
 				- LOAD_OFFSET) {			\
 		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
-		VMLINUX_SYMBOL(__per_cpu_load) = LOADADDR(.data.percpu) + LOAD_OFFSET;\
 		*(.data.percpu.first)					\
 		*(.data.percpu.page_aligned)				\
 		*(.data.percpu)						\
 		*(.data.percpu.shared_aligned)				\
 		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
 	} phdr								\
-	. = VMLINUX_SYMBOL(__per_cpu_load_abs) + SIZEOF(.data.percpu);
+	. = VMLINUX_SYMBOL(__per_cpu_load) + SIZEOF(.data.percpu);
 
 /**
  * PERCPU - define output section for percpu area, simple version
-- 
cgit 


From 3ac6cffea4aa18007a454a7442da2855882f403d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 30 Jan 2009 16:32:22 +0900
Subject: linker script: use separate simpler definition for PERCPU()

Impact: fix linker screwup on x86_32

Recent x86_64 zerobased patches introduced PERCPU_VADDR() to put
.data.percpu to a predefined address and re-defined PERCPU() in terms
of it.  The new macro defined one extra symbol, __per_cpu_load, for
LMA of the section so that the init data could be accessed.  This new
symbol introduced the following problems to x86_32.

1. If __per_cpu_load is defined outside of .data.percpu as an absolute
   symbol, relocation generation for relocatable kernel fails due to
   absolute relocation.

2. If __per_cpu_load is put inside .data.percpu with absolute address
   assignment to work around #1, linker gets confused and under
   certain configurations ends up relocating the symbol against
   .data.percpu such that the load address gets added on top of
   already set load address.

As x86_32 doesn't use predefined address for .data.percpu, there's no
need for it to care about the possibility of __per_cpu_load being
different from __per_cpu_start.

This patch defines PERCPU() separately so that __per_cpu_load is
defined inside .data.percpu so that everything is ordinary
linking-wise.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 53e21f36a802..5406e70aba86 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -445,10 +445,9 @@
  * section in the linker script will go there too.  @phdr should have
  * a leading colon.
  *
- * This macro defines three symbols, __per_cpu_load, __per_cpu_start
- * and __per_cpu_end.  The first one is the vaddr of loaded percpu
- * init data.  __per_cpu_start equals @vaddr and __per_cpu_end is the
- * end offset.
+ * Note that this macro defines __per_cpu_load as an absolute symbol.
+ * If there is no need to put the percpu section at a predetermined
+ * address, use PERCPU().
  */
 #define PERCPU_VADDR(vaddr, phdr)					\
 	VMLINUX_SYMBOL(__per_cpu_load) = .;				\
@@ -470,7 +469,20 @@
  * Align to @align and outputs output section for percpu area.  This
  * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and
  * __per_cpu_start will be identical.
+ *
+ * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except
+ * that __per_cpu_load is defined as a relative symbol against
+ * .data.percpu which is required for relocatable x86_32
+ * configuration.
  */
 #define PERCPU(align)							\
 	. = ALIGN(align);						\
-	PERCPU_VADDR( , )
+	.data.percpu	: AT(ADDR(.data.percpu) - LOAD_OFFSET) {	\
+		VMLINUX_SYMBOL(__per_cpu_load) = .;			\
+		VMLINUX_SYMBOL(__per_cpu_start) = .;			\
+		*(.data.percpu.first)					\
+		*(.data.percpu.page_aligned)				\
+		*(.data.percpu)						\
+		*(.data.percpu.shared_aligned)				\
+		VMLINUX_SYMBOL(__per_cpu_end) = .;			\
+	}
-- 
cgit 


From dacd762eabf69e32f0e9181f99fd19b6f96ea5c5 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 31 Jan 2009 10:53:32 +0530
Subject: headers_check fix: frv, swab.h

fix the following 'make headers_check' warning:

  usr/include/asm-frv/swab.h:4: include of <linux/types.h> is preferred over <asm/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/asm-frv/swab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-frv/swab.h b/include/asm-frv/swab.h
index afb3396ba5ed..f305834b4799 100644
--- a/include/asm-frv/swab.h
+++ b/include/asm-frv/swab.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_SWAB_H
 #define _ASM_SWAB_H
 
-#include <asm/types.h>
+#include <linux/types.h>
 
 #if defined(__GNUC__) && !defined(__STRICT_ANSI__) || defined(__KERNEL__)
 #  define __SWAB_64_THRU_32__
-- 
cgit 


From d8cbec15af88e067f33cb78efad15d581fa79b12 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 31 Jan 2009 11:18:19 +0530
Subject: headers_check fix: m32r, swab.h

fix the following 'make headers_check' warning:

  usr/include/asm-m32r/swab.h:4: include of <linux/types.h> is preferred over <asm/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/asm-m32r/swab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-m32r/swab.h b/include/asm-m32r/swab.h
index 97973e101825..54dab001d6d1 100644
--- a/include/asm-m32r/swab.h
+++ b/include/asm-m32r/swab.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_M32R_SWAB_H
 #define _ASM_M32R_SWAB_H
 
-#include <asm/types.h>
+#include <linux/types.h>
 
 #if !defined(__STRICT_ANSI__) || defined(__KERNEL__)
 #  define __SWAB_64_THRU_32__
-- 
cgit 


From bef53ca086e069a3fb8e6bf4ecf06221de9b445f Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 31 Jan 2009 11:29:28 +0530
Subject: headers_check fix: mn10300, swab.h

fix the following 'make headers_check' warnings:

  usr/include/asm-mn10300/swab.h:14: include of <linux/types.h> is preferred over <asm/types.h>
  usr/include/asm-mn10300/swab.h:19: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/asm-mn10300/swab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/asm-mn10300/swab.h b/include/asm-mn10300/swab.h
index 4504d1b4b477..bd818a820ca8 100644
--- a/include/asm-mn10300/swab.h
+++ b/include/asm-mn10300/swab.h
@@ -11,7 +11,7 @@
 #ifndef _ASM_SWAB_H
 #define _ASM_SWAB_H
 
-#include <asm/types.h>
+#include <linux/types.h>
 
 #ifdef __GNUC__
 
-- 
cgit 


From 7e7f4eae28711fbb7f4d5e4b0aa3195776194bc1 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 2 Feb 2009 21:40:10 +0530
Subject: headers_check fix: linux/coda_psdev.h

fix the following 'make headers_check' warning:

  usr/include/linux/coda_psdev.h:90: extern's make no sense in userspace

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/coda_psdev.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 07ae8f846055..6f06352cf55e 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -24,7 +24,7 @@ static inline struct venus_comm *coda_vcp(struct super_block *sb)
 	return (struct venus_comm *)((sb)->s_fs_info);
 }
 
-
+#ifdef __KERNEL__
 /* upcalls */
 int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
 int venus_getattr(struct super_block *sb, struct CodaFid *fid,
@@ -64,6 +64,12 @@ int coda_downcall(int opcode, union outputArgs *out, struct super_block *sb);
 int venus_fsync(struct super_block *sb, struct CodaFid *fid);
 int venus_statfs(struct dentry *dentry, struct kstatfs *sfs);
 
+/*
+ * Statistics
+ */
+
+extern struct venus_comm coda_comms[];
+#endif /* __KERNEL__ */
 
 /* messages between coda filesystem in kernel and Venus */
 struct upc_req {
@@ -82,11 +88,4 @@ struct upc_req {
 #define REQ_WRITE  0x4
 #define REQ_ABORT  0x8
 
-
-/*
- * Statistics
- */
-
-extern struct venus_comm coda_comms[];
-
 #endif
-- 
cgit 


From 25d00fddf8d23234da2d45c051a14450939496d6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 2 Feb 2009 21:40:58 +0530
Subject: headers_check fix: linux/in6.h

fix the following 'make headers_check' warnings:

  usr/include/linux/in6.h:47: extern's make no sense in userspace
  usr/include/linux/in6.h:49: extern's make no sense in userspace

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/in6.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/in6.h b/include/linux/in6.h
index bc492048c349..718bf21c5754 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -44,11 +44,11 @@ struct in6_addr
  * NOTE: Be aware the IN6ADDR_* constants and in6addr_* externals are defined
  * in network byte order, not in host byte order as are the IPv4 equivalents
  */
+#ifdef __KERNEL__
 extern const struct in6_addr in6addr_any;
 #define IN6ADDR_ANY_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } } }
 extern const struct in6_addr in6addr_loopback;
 #define IN6ADDR_LOOPBACK_INIT { { { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } }
-#ifdef __KERNEL__
 extern const struct in6_addr in6addr_linklocal_allnodes;
 #define IN6ADDR_LINKLOCAL_ALLNODES_INIT	\
 		{ { { 0xff,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1 } } }
-- 
cgit 


From 9fe03bc3139503fbad66016bf714f4575babf651 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 2 Feb 2009 21:41:41 +0530
Subject: headers_check fix: linux/nubus.h

fix the following 'make headers_check' warnings:

  usr/include/linux/nubus.h:297: extern's make no sense in userspace
  usr/include/linux/nubus.h:299: extern's make no sense in userspace
  usr/include/linux/nubus.h:303: extern's make no sense in userspace

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/nubus.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index 7382af374731..9be57d992695 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -296,6 +296,7 @@ struct nubus_dev {
 	struct nubus_board* board;
 };
 
+#ifdef __KERNEL__
 /* This is all NuBus devices (used to find devices later on) */
 extern struct nubus_dev* nubus_devices;
 /* This is all NuBus cards */
@@ -351,6 +352,7 @@ void nubus_get_rsrc_mem(void* dest,
 void nubus_get_rsrc_str(void* dest,
 			const struct nubus_dirent *dirent,
 			int maxlen);
+#endif /* __KERNEL__ */
 
 /* We'd like to get rid of this eventually.  Only daynaport.c uses it now. */
 static inline void *nubus_slot_addr(int slot)
-- 
cgit 


From 7d7dc0d6b0565484e0623cb08b5dcdd56424697b Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 2 Feb 2009 21:44:09 +0530
Subject: headers_check fix: linux/socket.h

fix the following 'make headers_check' warning:

  usr/include/linux/socket.h:29: extern's make no sense in userspace

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/socket.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/socket.h b/include/linux/socket.h
index 20fc4bbfca42..afc01909a428 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -24,10 +24,12 @@ struct __kernel_sockaddr_storage {
 #include <linux/types.h>		/* pid_t			*/
 #include <linux/compiler.h>		/* __user			*/
 
-#ifdef CONFIG_PROC_FS
+#ifdef __KERNEL__
+# ifdef CONFIG_PROC_FS
 struct seq_file;
 extern void socket_seq_show(struct seq_file *seq);
-#endif
+# endif
+#endif /* __KERNEL__ */
 
 typedef unsigned short	sa_family_t;
 
-- 
cgit 


From 11d9f653aff1d445b4300ae1d2e2d675a0e9172f Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Mon, 2 Feb 2009 21:45:41 +0530
Subject: headers_check fix: linux/reiserfs_fs.h

fix the following 'make headers_check' warnings:

  usr/include/linux/reiserfs_fs.h:687: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:995: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:997: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1467: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1760: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1764: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1766: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1769: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1771: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1805: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1948: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1949: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1950: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1951: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1962: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1963: extern's make no sense in userspace
  usr/include/linux/reiserfs_fs.h:1964: extern's make no sense in userspace

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/reiserfs_fs.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index bc5114d35e99..a4db55fd1f65 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -698,7 +698,9 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key)
 /* object identifier for root dir */
 #define REISERFS_ROOT_OBJECTID 2
 #define REISERFS_ROOT_PARENT_OBJECTID 1
+#ifdef __KERNEL__
 extern struct reiserfs_key root_key;
+#endif /* __KERNEL__ */
 
 /* 
  * Picture represents a leaf of the S+tree
@@ -1006,10 +1008,12 @@ struct reiserfs_de_head {
 #define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
 #define de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
 
+#ifdef __KERNEL__
 extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
 				   __le32 par_dirid, __le32 par_objid);
 extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
 				__le32 par_dirid, __le32 par_objid);
+#endif /* __KERNEL__ */
 
 /* array of the entry headers */
  /* get item body */
@@ -1478,7 +1482,9 @@ struct item_operations {
 	void (*print_vi) (struct virtual_item * vi);
 };
 
+#ifdef __KERNEL__
 extern struct item_operations *item_ops[TYPE_ANY + 1];
+#endif /* __KERNEL__ */
 
 #define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
 #define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
@@ -1679,6 +1685,7 @@ struct reiserfs_transaction_handle {
 	struct list_head t_list;
 };
 
+#ifdef __KERNEL__
 /* used to keep track of ordered and tail writes, attached to the buffer
  * head through b_journal_head.
  */
@@ -2203,4 +2210,5 @@ int reiserfs_unpack(struct inode *inode, struct file *filp);
 /* xattr stuff */
 #define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
 
+#endif /* __KERNEL__ */
 #endif				/* _LINUX_REISER_FS_H */
-- 
cgit 


From f2cddb29ebfc02dfd2c4b439aa0433393ad15575 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Tue, 3 Feb 2009 19:21:38 +0530
Subject: headers_check fix cleanup: linux/coda_psdev.h

These are only for kernel internals, as pointed out by Arnd Bergmann:
  struct kstatfs
  struct venus_comm
  coda_vcp()

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/coda_psdev.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h
index 6f06352cf55e..5b5d4731f956 100644
--- a/include/linux/coda_psdev.h
+++ b/include/linux/coda_psdev.h
@@ -6,6 +6,7 @@
 #define CODA_PSDEV_MAJOR 67
 #define MAX_CODADEVS  5	   /* how many do we allow */
 
+#ifdef __KERNEL__
 struct kstatfs;
 
 /* communication pending/processing queues */
@@ -24,7 +25,6 @@ static inline struct venus_comm *coda_vcp(struct super_block *sb)
 	return (struct venus_comm *)((sb)->s_fs_info);
 }
 
-#ifdef __KERNEL__
 /* upcalls */
 int venus_rootfid(struct super_block *sb, struct CodaFid *fidp);
 int venus_getattr(struct super_block *sb, struct CodaFid *fid,
-- 
cgit 


From 5007b1fc4ef2c1b496536b2f026353c1d44d92ef Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Tue, 3 Feb 2009 19:28:24 +0530
Subject: headers_check fix cleanup: linux/nubus.h

These are only for kernel internals, as pointed out by Arnd Bergmann:
   struct nubus_board
   struct nubus_dev

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/nubus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nubus.h b/include/linux/nubus.h
index 9be57d992695..e137b3c486a7 100644
--- a/include/linux/nubus.h
+++ b/include/linux/nubus.h
@@ -237,6 +237,7 @@ struct nubus_dirent
 	int mask;
 };
 
+#ifdef __KERNEL__
 struct nubus_board {
 	struct nubus_board* next;
 	struct nubus_dev* first_dev;
@@ -296,7 +297,6 @@ struct nubus_dev {
 	struct nubus_board* board;
 };
 
-#ifdef __KERNEL__
 /* This is all NuBus devices (used to find devices later on) */
 extern struct nubus_dev* nubus_devices;
 /* This is all NuBus cards */
-- 
cgit 


From 750e1c18251345e662bb7e7062b5fd5c1ade36de Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Tue, 3 Feb 2009 19:40:03 +0530
Subject: headers_check fix cleanup: linux/reiserfs_fs.h

Only the REISERFS_IOC_* definitions are required for user space; the
rest should be inside #ifdef __KERNEL__, as pointed out by Arnd Bergmann.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/reiserfs_fs.h | 62 ++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 34 deletions(-)

(limited to 'include')

diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index a4db55fd1f65..e356c99f0659 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -28,8 +28,6 @@
 #include <linux/reiserfs_fs_sb.h>
 #endif
 
-struct fid;
-
 /*
  *  include/linux/reiser_fs.h
  *
@@ -37,6 +35,33 @@ struct fid;
  *
  */
 
+/* ioctl's command */
+#define REISERFS_IOC_UNPACK		_IOW(0xCD,1,long)
+/* define following flags to be the same as in ext2, so that chattr(1),
+   lsattr(1) will work with us. */
+#define REISERFS_IOC_GETFLAGS		FS_IOC_GETFLAGS
+#define REISERFS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define REISERFS_IOC_GETVERSION		FS_IOC_GETVERSION
+#define REISERFS_IOC_SETVERSION		FS_IOC_SETVERSION
+
+#ifdef __KERNEL__
+/* the 32 bit compat definitions with int argument */
+#define REISERFS_IOC32_UNPACK		_IOW(0xCD, 1, int)
+#define REISERFS_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
+#define REISERFS_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
+#define REISERFS_IOC32_GETVERSION	FS_IOC32_GETVERSION
+#define REISERFS_IOC32_SETVERSION	FS_IOC32_SETVERSION
+
+/* Locking primitives */
+/* Right now we are still falling back to (un)lock_kernel, but eventually that
+   would evolve into real per-fs locks */
+#define reiserfs_write_lock( sb ) lock_kernel()
+#define reiserfs_write_unlock( sb ) unlock_kernel()
+
+/* xattr stuff */
+#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
+struct fid;
+
 /* in reading the #defines, it may help to understand that they employ
    the following abbreviations:
 
@@ -698,9 +723,8 @@ static inline void cpu_key_k_offset_dec(struct cpu_key *key)
 /* object identifier for root dir */
 #define REISERFS_ROOT_OBJECTID 2
 #define REISERFS_ROOT_PARENT_OBJECTID 1
-#ifdef __KERNEL__
+
 extern struct reiserfs_key root_key;
-#endif /* __KERNEL__ */
 
 /* 
  * Picture represents a leaf of the S+tree
@@ -1008,12 +1032,10 @@ struct reiserfs_de_head {
 #define de_visible(deh)	    	    test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
 #define de_hidden(deh)	    	    !test_bit_unaligned (DEH_Visible, &((deh)->deh_state))
 
-#ifdef __KERNEL__
 extern void make_empty_dir_item_v1(char *body, __le32 dirid, __le32 objid,
 				   __le32 par_dirid, __le32 par_objid);
 extern void make_empty_dir_item(char *body, __le32 dirid, __le32 objid,
 				__le32 par_dirid, __le32 par_objid);
-#endif /* __KERNEL__ */
 
 /* array of the entry headers */
  /* get item body */
@@ -1482,9 +1504,7 @@ struct item_operations {
 	void (*print_vi) (struct virtual_item * vi);
 };
 
-#ifdef __KERNEL__
 extern struct item_operations *item_ops[TYPE_ANY + 1];
-#endif /* __KERNEL__ */
 
 #define op_bytes_number(ih,bsize)                    item_ops[le_ih_k_type (ih)]->bytes_number (ih, bsize)
 #define op_is_left_mergeable(key,bsize)              item_ops[le_key_k_type (le_key_version (key), key)]->is_left_mergeable (key, bsize)
@@ -1546,7 +1566,6 @@ struct reiserfs_iget_args {
 /*                    FUNCTION DECLARATIONS                                */
 /***************************************************************************/
 
-/*#ifdef __KERNEL__*/
 #define get_journal_desc_magic(bh) (bh->b_data + bh->b_size - 12)
 
 #define journal_trans_half(blocksize) \
@@ -1685,7 +1704,6 @@ struct reiserfs_transaction_handle {
 	struct list_head t_list;
 };
 
-#ifdef __KERNEL__
 /* used to keep track of ordered and tail writes, attached to the buffer
  * head through b_journal_head.
  */
@@ -2185,30 +2203,6 @@ long reiserfs_compat_ioctl(struct file *filp,
 		   unsigned int cmd, unsigned long arg);
 int reiserfs_unpack(struct inode *inode, struct file *filp);
 
-/* ioctl's command */
-#define REISERFS_IOC_UNPACK		_IOW(0xCD,1,long)
-/* define following flags to be the same as in ext2, so that chattr(1),
-   lsattr(1) will work with us. */
-#define REISERFS_IOC_GETFLAGS		FS_IOC_GETFLAGS
-#define REISERFS_IOC_SETFLAGS		FS_IOC_SETFLAGS
-#define REISERFS_IOC_GETVERSION		FS_IOC_GETVERSION
-#define REISERFS_IOC_SETVERSION		FS_IOC_SETVERSION
-
-/* the 32 bit compat definitions with int argument */
-#define REISERFS_IOC32_UNPACK		_IOW(0xCD, 1, int)
-#define REISERFS_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
-#define REISERFS_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
-#define REISERFS_IOC32_GETVERSION	FS_IOC32_GETVERSION
-#define REISERFS_IOC32_SETVERSION	FS_IOC32_SETVERSION
-
-/* Locking primitives */
-/* Right now we are still falling back to (un)lock_kernel, but eventually that
-   would evolve into real per-fs locks */
-#define reiserfs_write_lock( sb ) lock_kernel()
-#define reiserfs_write_unlock( sb ) unlock_kernel()
-
-/* xattr stuff */
-#define REISERFS_XATTR_DIR_SEM(s) (REISERFS_SB(s)->xattr_dir_sem)
 
 #endif /* __KERNEL__ */
 #endif				/* _LINUX_REISER_FS_H */
-- 
cgit 


From 7b2cd92adc5430b0c1adeb120971852b4ea1ab08 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 5 Feb 2009 16:48:24 +1100
Subject: crypto: api - Fix zeroing on free

Geert Uytterhoeven pointed out that we're not zeroing all the
memory when freeing a transform.  This patch fixes it by calling
ksize to ensure that we zero everything in sight.

Reported-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 crypto/api.c           | 20 ++++++++++----------
 include/linux/crypto.h |  7 ++++++-
 2 files changed, 16 insertions(+), 11 deletions(-)

(limited to 'include')

diff --git a/crypto/api.c b/crypto/api.c
index 9975a7bd246c..efe77df6863f 100644
--- a/crypto/api.c
+++ b/crypto/api.c
@@ -557,34 +557,34 @@ err:
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(crypto_alloc_tfm);
- 
+
 /*
- *	crypto_free_tfm - Free crypto transform
+ *	crypto_destroy_tfm - Free crypto transform
+ *	@mem: Start of tfm slab
  *	@tfm: Transform to free
  *
- *	crypto_free_tfm() frees up the transform and any associated resources,
+ *	This function frees up the transform and any associated resources,
  *	then drops the refcount on the associated algorithm.
  */
-void crypto_free_tfm(struct crypto_tfm *tfm)
+void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm)
 {
 	struct crypto_alg *alg;
 	int size;
 
-	if (unlikely(!tfm))
+	if (unlikely(!mem))
 		return;
 
 	alg = tfm->__crt_alg;
-	size = sizeof(*tfm) + alg->cra_ctxsize;
+	size = ksize(mem);
 
 	if (!tfm->exit && alg->cra_exit)
 		alg->cra_exit(tfm);
 	crypto_exit_ops(tfm);
 	crypto_mod_put(alg);
-	memset(tfm, 0, size);
-	kfree(tfm);
+	memset(mem, 0, size);
+	kfree(mem);
 }
-
-EXPORT_SYMBOL_GPL(crypto_free_tfm);
+EXPORT_SYMBOL_GPL(crypto_destroy_tfm);
 
 int crypto_has_alg(const char *name, u32 type, u32 mask)
 {
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index 3bacd71509fb..1f2e9020acc6 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -552,7 +552,12 @@ struct crypto_tfm *crypto_alloc_tfm(const char *alg_name,
 				    const struct crypto_type *frontend,
 				    u32 type, u32 mask);
 struct crypto_tfm *crypto_alloc_base(const char *alg_name, u32 type, u32 mask);
-void crypto_free_tfm(struct crypto_tfm *tfm);
+void crypto_destroy_tfm(void *mem, struct crypto_tfm *tfm);
+
+static inline void crypto_free_tfm(struct crypto_tfm *tfm)
+{
+	return crypto_destroy_tfm(tfm, tfm);
+}
 
 int alg_test(const char *driver, const char *alg, u32 type, u32 mask);
 
-- 
cgit 


From 412e87ae5d852bc3d836f475c19d954b3324363d Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 5 Feb 2009 16:51:25 +1100
Subject: crypto: shash - Fix tfm destruction

We were freeing an offset into the slab object instead of the
start.  This patch fixes it by calling crypto_destroy_tfm which
allows the correct address to be given.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 include/crypto/hash.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/crypto/hash.h b/include/crypto/hash.h
index cd16d6e668ce..d797e119e3d5 100644
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -222,7 +222,7 @@ static inline struct crypto_tfm *crypto_shash_tfm(struct crypto_shash *tfm)
 
 static inline void crypto_free_shash(struct crypto_shash *tfm)
 {
-	crypto_free_tfm(crypto_shash_tfm(tfm));
+	crypto_destroy_tfm(tfm, crypto_shash_tfm(tfm));
 }
 
 static inline unsigned int crypto_shash_alignmask(
-- 
cgit 


From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Feb 2009 12:24:15 +0100
Subject: signal: re-add dead task accumulation stats.

We're going to split the process wide cpu accounting into two parts:

 - clocks; which can take all the time they want since they run
           from user context.

 - timers; which need constant time tracing but can afford the overhead
           because they're default off -- and rare.

The clock readout will go back to a full sum of the thread group, for this
we need to re-add the exit stats that were removed in the initial itimer
rework (f06febc9: timers: fix itimer/many thread hang).

Furthermore, since that full sum can be rather slow for large thread groups
and we have the complete dead task stats, revert the do_notify_parent time
computation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 10 +++++++++-
 kernel/exit.c         |  3 +++
 kernel/fork.c         |  3 ++-
 kernel/signal.c       |  8 ++++----
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2127e959e0f4..2e0646a30314 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -559,7 +559,7 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t cutime, cstime;
+	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -567,6 +567,14 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	struct task_io_accounting ioac;
 
+	/*
+	 * Cumulative ns of schedule CPU time fo dead threads in the
+	 * group, not including a zombie group leader, (This only differs
+	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+	 * other than jiffies.)
+	 */
+	unsigned long long sum_sched_runtime;
+
 	/*
 	 * We don't bother to synchronize most readers of this at all,
 	 * because there is no reader checking a limit that actually needs
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f1875..efd30ccf3858 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
+		sig->utime = cputime_add(sig->utime, task_utime(tsk));
+		sig->stime = cputime_add(sig->stime, task_stime(tsk));
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
+		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 242a706e7721..e8e854a04ad2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->tty_old_pgrp = NULL;
 	sig->tty = NULL;
 
-	sig->cutime = sig->cstime = cputime_zero;
+	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
 	task_io_accounting_init(&sig->ioac);
+	sig->sum_sched_runtime = 0;
 	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
diff --git a/kernel/signal.c b/kernel/signal.c
index b6b36768b758..2a74fe87c0dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
-	struct task_cputime cputime;
 	int ret = sig;
 
 	BUG_ON(sig == -1);
@@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	thread_group_cputime(tsk, &cputime);
-	info.si_utime = cputime_to_jiffies(cputime.utime);
-	info.si_stime = cputime_to_jiffies(cputime.stime);
+	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
+				tsk->signal->utime));
+	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
+				tsk->signal->stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
-- 
cgit 


From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Feb 2009 12:24:16 +0100
Subject: timers: split process wide cpu clocks/timers

Change the process wide cpu timers/clocks so that we:

 1) don't mess up the kernel with too many threads,
 2) don't have a per-cpu allocation for each process,
 3) have no impact when not used.

In order to accomplish this we're going to split it into two parts:

 - clocks; which can take all the time they want since they run
           from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID)

 - timers; which need constant time sampling but since they're
           explicitly used, the user can pay the overhead.

The clock readout will go back to a full sum of the thread group, while the
timers will run off a global 'clock' that only runs when needed, so only
programs that make use of the facility pay the price.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h | 11 +++---
 include/linux/sched.h     | 54 +++++++++++++++------------
 kernel/itimer.c           |  4 +-
 kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_stats.h      | 45 ++++++++++++----------
 5 files changed, 155 insertions(+), 54 deletions(-)

(limited to 'include')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index ea0ea1a4c36f..e752d973fa21 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -48,12 +48,11 @@ extern struct fs_struct init_fs;
 	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
-	.cputime	= { .totals = {					\
-		.utime = cputime_zero,					\
-		.stime = cputime_zero,					\
-		.sum_exec_runtime = 0,					\
-		.lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock),	\
-	}, },								\
+	.cputimer	= { 						\
+		.cputime = INIT_CPUTIME,				\
+		.running = 0,						\
+		.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+	},								\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e0646a30314..082d7619b3a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -443,7 +443,6 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
- * @lock:		lock for fields in this struct
  *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
@@ -454,23 +453,33 @@ struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
-	spinlock_t lock;
 };
 /* Alternate field names when used to cache expirations. */
 #define prof_exp	stime
 #define virt_exp	utime
 #define sched_exp	sum_exec_runtime
 
+#define INIT_CPUTIME	\
+	(struct task_cputime) {					\
+		.utime = cputime_zero,				\
+		.stime = cputime_zero,				\
+		.sum_exec_runtime = 0,				\
+	}
+
 /**
- * struct thread_group_cputime - thread group interval timer counts
- * @totals:		thread group interval timers; substructure for
- *			uniprocessor kernel, per-cpu for SMP kernel.
+ * struct thread_group_cputimer - thread group interval timer counts
+ * @cputime:		thread group interval timers.
+ * @running:		non-zero when there are timers running and
+ * 			@cputime receives updates.
+ * @lock:		lock for fields in this struct.
  *
  * This structure contains the version of task_cputime, above, that is
- * used for thread group CPU clock calculations.
+ * used for thread group CPU timer calculations.
  */
-struct thread_group_cputime {
-	struct task_cputime totals;
+struct thread_group_cputimer {
+	struct task_cputime cputime;
+	int running;
+	spinlock_t lock;
 };
 
 /*
@@ -519,10 +528,10 @@ struct signal_struct {
 	cputime_t it_prof_incr, it_virt_incr;
 
 	/*
-	 * Thread group totals for process CPU clocks.
-	 * See thread_group_cputime(), et al, for details.
+	 * Thread group totals for process CPU timers.
+	 * See thread_group_cputimer(), et al, for details.
 	 */
-	struct thread_group_cputime cputime;
+	struct thread_group_cputimer cputimer;
 
 	/* Earliest-expiration cache. */
 	struct task_cputime cputime_expires;
@@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 /*
  * Thread group CPU time accounting.
  */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 
 static inline
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
-	struct task_cputime *totals = &tsk->signal->cputime.totals;
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&totals->lock, flags);
-	*times = *totals;
-	spin_unlock_irqrestore(&totals->lock, flags);
+	WARN_ON(!cputimer->running);
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	*times = cputimer->cputime;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
-	sig->cputime.totals = (struct task_cputime){
-		.utime = cputime_zero,
-		.stime = cputime_zero,
-		.sum_exec_runtime = 0,
-	};
-
-	spin_lock_init(&sig->cputime.totals.lock);
+	sig->cputimer.cputime = INIT_CPUTIME;
+	spin_lock_init(&sig->cputimer.lock);
+	sig->cputimer.running = 0;
 }
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6a5fe93dd8bd..58762f7077ec 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
 			struct task_cputime cputime;
 			cputime_t utime;
 
-			thread_group_cputime(tsk, &cputime);
+			thread_group_cputimer(tsk, &cputime);
 			utime = cputime.utime;
 			if (cputime_le(cval, utime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
 			struct task_cputime times;
 			cputime_t ptime;
 
-			thread_group_cputime(tsk, &times);
+			thread_group_cputimer(tsk, &times);
 			ptime = cputime_add(times.utime, times.stime);
 			if (cputime_le(cval, ptime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da94d7be..db107c9bbc05 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct sighand_struct *sighand;
+	struct signal_struct *sig;
+	struct task_struct *t;
+
+	*times = INIT_CPUTIME;
+
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
+	if (!sighand)
+		goto out;
+
+	sig = tsk->signal;
+
+	t = tsk;
+	do {
+		times->utime = cputime_add(times->utime, t->utime);
+		times->stime = cputime_add(times->stime, t->stime);
+		times->sum_exec_runtime += t->se.sum_exec_runtime;
+
+		t = next_thread(t);
+	} while (t != tsk);
+
+	times->utime = cputime_add(times->utime, sig->utime);
+	times->stime = cputime_add(times->stime, sig->stime);
+	times->sum_exec_runtime += sig->sum_sched_runtime;
+out:
+	rcu_read_unlock();
+}
+
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
@@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 					     now);
 }
 
+/*
+ * Enable the process wide cpu timer accounting.
+ *
+ * serialized using ->sighand->siglock
+ */
+static void start_process_timers(struct task_struct *tsk)
+{
+	tsk->signal->cputimer.running = 1;
+	barrier();
+}
+
+/*
+ * Release the process wide timer accounting -- timer stops ticking when
+ * nobody cares about it.
+ *
+ * serialized using ->sighand->siglock
+ */
+static void stop_process_timers(struct task_struct *tsk)
+{
+	tsk->signal->cputimer.running = 0;
+	barrier();
+}
+
 /*
  * Insert the timer on the appropriate list before any timers that
  * expire later.  This must be called with the tasklist_lock held
@@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	BUG_ON(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
+	if (!CPUCLOCK_PERTHREAD(timer->it_clock))
+		start_process_timers(p);
+
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
@@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk,
 	    sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
 	    list_empty(&timers[CPUCLOCK_VIRT]) &&
 	    cputime_eq(sig->it_virt_expires, cputime_zero) &&
-	    list_empty(&timers[CPUCLOCK_SCHED]))
+	    list_empty(&timers[CPUCLOCK_SCHED])) {
+		stop_process_timers(tsk);
 		return;
+	}
 
 	/*
 	 * Collect the current process totals.
 	 */
-	thread_group_cputime(tsk, &cputime);
+	thread_group_cputimer(tsk, &cputime);
 	utime = cputime.utime;
 	ptime = cputime_add(utime, cputime.stime);
 	sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (!task_cputime_zero(&sig->cputime_expires)) {
 		struct task_cputime group_sample;
 
-		thread_group_cputime(tsk, &group_sample);
+		thread_group_cputimer(tsk, &group_sample);
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
 	}
@@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	}
 }
 
+/*
+ * Sample a process (thread group) timer for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_timer_sample_group(const clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputimer(p, &cputime);
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = cputime.utime;
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		break;
+	}
+	return 0;
+}
+
 /*
  * Set one of the process-wide special case CPU timers.
  * The tsk->sighand->siglock must be held by the caller.
@@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	cpu_clock_sample_group(clock_idx, tsk, &now);
+	start_process_timers(tsk);
+	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
 		if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8ab0cef8ecab..a8f93dd374e1 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
-	struct task_cputime *times;
-	struct signal_struct *sig;
+	struct thread_group_cputimer *cputimer;
 
 	/* tsk == current, ensure it is safe to use ->signal */
 	if (unlikely(tsk->exit_state))
 		return;
 
-	sig = tsk->signal;
-	times = &sig->cputime.totals;
+	cputimer = &tsk->signal->cputimer;
 
-	spin_lock(&times->lock);
-	times->utime = cputime_add(times->utime, cputime);
-	spin_unlock(&times->lock);
+	if (!cputimer->running)
+		return;
+
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.utime =
+		cputime_add(cputimer->cputime.utime, cputime);
+	spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
-	struct task_cputime *times;
-	struct signal_struct *sig;
+	struct thread_group_cputimer *cputimer;
 
 	/* tsk == current, ensure it is safe to use ->signal */
 	if (unlikely(tsk->exit_state))
 		return;
 
-	sig = tsk->signal;
-	times = &sig->cputime.totals;
+	cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)
+		return;
 
-	spin_lock(&times->lock);
-	times->stime = cputime_add(times->stime, cputime);
-	spin_unlock(&times->lock);
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.stime =
+		cputime_add(cputimer->cputime.stime, cputime);
+	spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
-	struct task_cputime *times;
+	struct thread_group_cputimer *cputimer;
 	struct signal_struct *sig;
 
 	sig = tsk->signal;
@@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (unlikely(!sig))
 		return;
 
-	times = &sig->cputime.totals;
+	cputimer = &sig->cputimer;
+
+	if (!cputimer->running)
+		return;
 
-	spin_lock(&times->lock);
-	times->sum_exec_runtime += ns;
-	spin_unlock(&times->lock);
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.sum_exec_runtime += ns;
+	spin_unlock(&cputimer->lock);
 }
-- 
cgit 


From 65a4e574d2382d83f71b30ea92f86d2e40a6ef8d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 31 Jan 2009 03:36:17 +0100
Subject: smp, generic: introduce arch_disable_smp_support() instead of
 disable_ioapic_setup()

Impact: cleanup

disable_ioapic_setup() in init/main.c is ugly as the function is
x86-specific. The #ifdef inline prototype there is ugly too.

Replace it with a generic arch_disable_smp_support() function - which
has a weak alias for non-x86 architectures and for non-ioapic x86 builds.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/io_apic.h |  9 ---------
 arch/x86/kernel/apic.c         |  4 +---
 arch/x86/kernel/io_apic.c      | 11 ++++++++++-
 arch/x86/kernel/smpboot.c      |  2 +-
 include/linux/smp.h            |  6 ++++++
 init/main.c                    | 12 ++++++------
 6 files changed, 24 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 08ec793aa043..309d0e23193a 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,15 +143,6 @@ extern int noioapicreroute;
 /* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
 extern int timer_through_8259;
 
-static inline void disable_ioapic_setup(void)
-{
-#ifdef CONFIG_PCI
-	noioapicquirk = 1;
-	noioapicreroute = -1;
-#endif
-	skip_ioapic_setup = 1;
-}
-
 /*
  * If we use the IO-APIC for IRQ routing, disable automatic
  * assignment of PCI IRQ's.
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 85d8b50d1af7..a04a73a51d20 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -1138,9 +1138,7 @@ void __cpuinit setup_local_APIC(void)
 	int i, j;
 
 	if (disable_apic) {
-#ifdef CONFIG_X86_IO_APIC
-		disable_ioapic_setup();
-#endif
+		arch_disable_smp_support();
 		return;
 	}
 
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 57d60c741e37..84bccac4619f 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -98,10 +98,19 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
 
 int skip_ioapic_setup;
 
+void arch_disable_smp_support(void)
+{
+#ifdef CONFIG_PCI
+	noioapicquirk = 1;
+	noioapicreroute = -1;
+#endif
+	skip_ioapic_setup = 1;
+}
+
 static int __init parse_noapic(char *str)
 {
 	/* disable IO-APIC */
-	disable_ioapic_setup();
+	arch_disable_smp_support();
 	return 0;
 }
 early_param("noapic", parse_noapic);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f40f86fec2fe..96f7d304f5c9 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1071,7 +1071,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
 		printk(KERN_ERR "... forcing use of dummy APIC emulation."
 				"(tell your hw vendor)\n");
 		smpboot_clear_io_apic();
-		disable_ioapic_setup();
+		arch_disable_smp_support();
 		return -1;
 	}
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 715196b09d67..d41a3a865fe3 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -66,6 +66,12 @@ extern int __cpu_up(unsigned int cpunum);
  */
 extern void smp_cpus_done(unsigned int max_cpus);
 
+/*
+ * Callback to arch code if there's nosmp or maxcpus=0 on the
+ * boot command line:
+ */
+extern void arch_disable_smp_support(void);
+
 /*
  * Call a function on all other processors
  */
diff --git a/init/main.c b/init/main.c
index bfe4fb0c9842..6441083f8273 100644
--- a/init/main.c
+++ b/init/main.c
@@ -136,14 +136,14 @@ unsigned int __initdata setup_max_cpus = NR_CPUS;
  * greater than 0, limits the maximum number of CPUs activated in
  * SMP mode to <NUM>.
  */
-#ifndef CONFIG_X86_IO_APIC
-static inline void disable_ioapic_setup(void) {};
-#endif
+
+void __weak arch_disable_smp_support(void) { }
 
 static int __init nosmp(char *str)
 {
 	setup_max_cpus = 0;
-	disable_ioapic_setup();
+	arch_disable_smp_support();
+
 	return 0;
 }
 
@@ -153,14 +153,14 @@ static int __init maxcpus(char *str)
 {
 	get_option(&str, &setup_max_cpus);
 	if (setup_max_cpus == 0)
-		disable_ioapic_setup();
+		arch_disable_smp_support();
 
 	return 0;
 }
 
 early_param("maxcpus", maxcpus);
 #else
-#define setup_max_cpus NR_CPUS
+const unsigned int setup_max_cpus = NR_CPUS;
 #endif
 
 /*
-- 
cgit 


From a146649bc19d5eba4f5bfac6720c5f252d517a71 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 31 Jan 2009 14:09:06 +0100
Subject: smp, generic: introduce arch_disable_smp_support(), build fix

This function should be provided on UP too.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/smp.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'include')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index d41a3a865fe3..bbacb7baa446 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -66,12 +66,6 @@ extern int __cpu_up(unsigned int cpunum);
  */
 extern void smp_cpus_done(unsigned int max_cpus);
 
-/*
- * Callback to arch code if there's nosmp or maxcpus=0 on the
- * boot command line:
- */
-extern void arch_disable_smp_support(void);
-
 /*
  * Call a function on all other processors
  */
@@ -182,6 +176,12 @@ static inline void init_call_single_data(void)
 #define put_cpu()		preempt_enable()
 #define put_cpu_no_resched()	preempt_enable_no_resched()
 
+/*
+ * Callback to arch code if there's nosmp or maxcpus=0 on the
+ * boot command line:
+ */
+extern void arch_disable_smp_support(void);
+
 void smp_setup_processor_id(void);
 
 #endif /* __LINUX_SMP_H */
-- 
cgit 


From 7d8e23df69820e6be42bcc41d441f4860e8c76f7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 6 Feb 2009 14:57:51 +0100
Subject: timers: split process wide cpu clocks/timers, remove spurious warning

Mike Galbraith reported that the new warning in thread_group_cputimer()
triggers en masse with Amarok running.

Oleg Nesterov observed:

  Can't fastpath_timer_check()->thread_group_cputimer() have the
  false warning too? Suppose we had the timer, then posix_cpu_timer_del()
  removes this timer, but task_cputime_zero(&sig->cputime_expires) still
  not true.

Remove the spurious debug warning.

Reported-by: Mike Galbraith <efault@gmx.de>
Explained-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 082d7619b3a1..79392916d6c9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2208,8 +2208,6 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	unsigned long flags;
 
-	WARN_ON(!cputimer->running);
-
 	spin_lock_irqsave(&cputimer->lock, flags);
 	*times = cputimer->cputime;
 	spin_unlock_irqrestore(&cputimer->lock, flags);
-- 
cgit 


From 527bdfee18ac6a4c026060c2c2b1144df9a5bf1f Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 6 Feb 2009 20:47:58 +0530
Subject: make linux/types.h as assembly safe

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/types.h b/include/linux/types.h
index 712ca53bc348..c30973ace890 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -1,6 +1,7 @@
 #ifndef _LINUX_TYPES_H
 #define _LINUX_TYPES_H
 
+#ifndef __ASSEMBLY__
 #ifdef	__KERNEL__
 
 #define DECLARE_BITMAP(name,bits) \
@@ -212,5 +213,5 @@ struct ustat {
 };
 
 #endif	/* __KERNEL__ */
-
+#endif /*  __ASSEMBLY__ */
 #endif /* _LINUX_TYPES_H */
-- 
cgit 


From b4bd07c20ba0c1fa7ad09ba257e0a5cfc2bf6bb3 Mon Sep 17 00:00:00 2001
From: "David S. Miller" <davem@davemloft.net>
Date: Fri, 6 Feb 2009 22:06:43 -0800
Subject: net_dma: call dmaengine_get only if NET_DMA enabled

Based upon a patch from Atsushi Nemoto <anemo@mba.ocn.ne.jp>

--------------------
The commit 649274d993212e7c23c0cb734572c2311c200872 ("net_dma:
acquire/release dma channels on ifup/ifdown") added unconditional call
of dmaengine_get() to net_dma.  The API should be called only if
NET_DMA was enabled.
--------------------

Signed-off-by: David S. Miller <davem@davemloft.net>
Acked-by: Dan Williams <dan.j.williams@intel.com>
---
 include/linux/dmaengine.h | 12 ++++++++++++
 net/core/dev.c            |  4 ++--
 2 files changed, 14 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 3e0f64c335c8..3e68469c1885 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -282,6 +282,18 @@ static inline void dmaengine_put(void)
 }
 #endif
 
+#ifdef CONFIG_NET_DMA
+#define net_dmaengine_get()	dmaengine_get()
+#define net_dmaengine_put()	dmaengine_put()
+#else
+static inline void net_dmaengine_get(void)
+{
+}
+static inline void net_dmaengine_put(void)
+{
+}
+#endif
+
 dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan,
 	void *dest, void *src, size_t len);
 dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan,
diff --git a/net/core/dev.c b/net/core/dev.c
index 5379b0c1190a..a17e00662363 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -1090,7 +1090,7 @@ int dev_open(struct net_device *dev)
 		/*
 		 *	Enable NET_DMA
 		 */
-		dmaengine_get();
+		net_dmaengine_get();
 
 		/*
 		 *	Initialize multicasting status
@@ -1172,7 +1172,7 @@ int dev_close(struct net_device *dev)
 	/*
 	 *	Shutdown NET_DMA
 	 */
-	dmaengine_put();
+	net_dmaengine_put();
 
 	return 0;
 }
-- 
cgit 


From 0fb807c3e573ff9de2965ca38c907605d4735d16 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sun, 8 Feb 2009 11:00:25 +0530
Subject: unconditionally include asm/types.h from linux/types.h

Reported-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 include/linux/types.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/types.h b/include/linux/types.h
index c30973ace890..fca82ed55f49 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_TYPES_H
 #define _LINUX_TYPES_H
 
+#include <asm/types.h>
+
 #ifndef __ASSEMBLY__
 #ifdef	__KERNEL__
 
@@ -10,7 +12,6 @@
 #endif
 
 #include <linux/posix_types.h>
-#include <asm/types.h>
 
 #ifndef __KERNEL_STRICT_NAMES
 
-- 
cgit 


From 0f973f27888e4664b253ab2cf69c67c2eb80ab1b Mon Sep 17 00:00:00 2001
From: Jesse Barnes <jbarnes@virtuousgeek.org>
Date: Mon, 26 Jan 2009 17:10:45 -0800
Subject: drm/i915: add fence register management to execbuf

Adds code to set up fence registers at execbuf time on pre-965 chips as
necessary.  Also fixes up a few bugs in the pre-965 tile register support
(get_order != ffs).  The number of fences available to the kernel defaults
to the hw limit minus 3 (for legacy X front/back/depth), but a new parameter
allows userspace to override that as needed.

Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Dave Airlie <airlied@linux.ie>
---
 drivers/gpu/drm/i915/i915_dma.c        | 10 ++++
 drivers/gpu/drm/i915/i915_drv.h        |  6 +++
 drivers/gpu/drm/i915/i915_gem.c        | 56 +++++++++++++++-------
 drivers/gpu/drm/i915/i915_gem_tiling.c | 88 +++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_reg.h        |  4 +-
 include/drm/i915_drm.h                 |  2 +
 6 files changed, 146 insertions(+), 20 deletions(-)

(limited to 'include')

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 1e01e7847155..cc0adb428cee 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -731,6 +731,9 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_GEM:
 		value = dev_priv->has_gem;
 		break;
+	case I915_PARAM_NUM_FENCES_AVAIL:
+		value = dev_priv->num_fence_regs - dev_priv->fence_reg_start;
+		break;
 	default:
 		DRM_ERROR("Unknown parameter %d\n", param->param);
 		return -EINVAL;
@@ -764,6 +767,13 @@ static int i915_setparam(struct drm_device *dev, void *data,
 	case I915_SETPARAM_ALLOW_BATCHBUFFER:
 		dev_priv->allow_batchbuffer = param->value;
 		break;
+	case I915_SETPARAM_NUM_USED_FENCES:
+		if (param->value > dev_priv->num_fence_regs ||
+		    param->value < 0)
+			return -EINVAL;
+		/* Userspace can use first N regs */
+		dev_priv->fence_reg_start = param->value;
+		break;
 	default:
 		DRM_ERROR("unknown parameter %d\n", param->param);
 		return -EINVAL;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index f471d218b89a..a70bf77290fc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -602,6 +602,7 @@ int i915_gem_init_object(struct drm_gem_object *obj);
 void i915_gem_free_object(struct drm_gem_object *obj);
 int i915_gem_object_pin(struct drm_gem_object *obj, uint32_t alignment);
 void i915_gem_object_unpin(struct drm_gem_object *obj);
+int i915_gem_object_unbind(struct drm_gem_object *obj);
 void i915_gem_lastclose(struct drm_device *dev);
 uint32_t i915_get_gem_seqno(struct drm_device *dev);
 void i915_gem_retire_requests(struct drm_device *dev);
@@ -785,6 +786,11 @@ extern int i915_wait_ring(struct drm_device * dev, int n, const char *caller);
 			IS_I945GM(dev) || IS_I965GM(dev) || IS_GM45(dev))
 
 #define I915_NEED_GFX_HWS(dev) (IS_G33(dev) || IS_GM45(dev) || IS_G4X(dev))
+/* With the 945 and later, Y tiling got adjusted so that it was 32 128-byte
+ * rows, which changed the alignment requirements and fence programming.
+ */
+#define HAS_128_BYTE_Y_TILING(dev) (IS_I9XX(dev) && !(IS_I915G(dev) || \
+						      IS_I915GM(dev)))
 #define SUPPORTS_INTEGRATED_HDMI(dev)	(IS_G4X(dev))
 
 #define PRIMARY_RINGBUFFER_SIZE         (128*1024)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e1f831f166ca..6a9e3a875083 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -52,7 +52,7 @@ static void i915_gem_object_free_page_list(struct drm_gem_object *obj);
 static int i915_gem_object_wait_rendering(struct drm_gem_object *obj);
 static int i915_gem_object_bind_to_gtt(struct drm_gem_object *obj,
 					   unsigned alignment);
-static int i915_gem_object_get_fence_reg(struct drm_gem_object *obj);
+static int i915_gem_object_get_fence_reg(struct drm_gem_object *obj, bool write);
 static void i915_gem_clear_fence_reg(struct drm_gem_object *obj);
 static int i915_gem_evict_something(struct drm_device *dev);
 static int i915_gem_phys_pwrite(struct drm_device *dev, struct drm_gem_object *obj,
@@ -567,6 +567,7 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	pgoff_t page_offset;
 	unsigned long pfn;
 	int ret = 0;
+	bool write = !!(vmf->flags & FAULT_FLAG_WRITE);
 
 	/* We don't use vmf->pgoff since that has the fake offset */
 	page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >>
@@ -586,7 +587,7 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	/* Need a new fence register? */
 	if (obj_priv->fence_reg == I915_FENCE_REG_NONE &&
 	    obj_priv->tiling_mode != I915_TILING_NONE) {
-		ret = i915_gem_object_get_fence_reg(obj);
+		ret = i915_gem_object_get_fence_reg(obj, write);
 		if (ret != 0)
 			return VM_FAULT_SIGBUS;
 	}
@@ -1214,7 +1215,7 @@ i915_gem_object_wait_rendering(struct drm_gem_object *obj)
 /**
  * Unbinds an object from the GTT aperture.
  */
-static int
+int
 i915_gem_object_unbind(struct drm_gem_object *obj)
 {
 	struct drm_device *dev = obj->dev;
@@ -1448,21 +1449,26 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *reg)
 	drm_i915_private_t *dev_priv = dev->dev_private;
 	struct drm_i915_gem_object *obj_priv = obj->driver_private;
 	int regnum = obj_priv->fence_reg;
+	int tile_width;
 	uint32_t val;
 	uint32_t pitch_val;
 
 	if ((obj_priv->gtt_offset & ~I915_FENCE_START_MASK) ||
 	    (obj_priv->gtt_offset & (obj->size - 1))) {
-		WARN(1, "%s: object not 1M or size aligned\n", __func__);
+		WARN(1, "%s: object 0x%08x not 1M or size (0x%x) aligned\n",
+		     __func__, obj_priv->gtt_offset, obj->size);
 		return;
 	}
 
-	if (obj_priv->tiling_mode == I915_TILING_Y && (IS_I945G(dev) ||
-						       IS_I945GM(dev) ||
-						       IS_G33(dev)))
-		pitch_val = (obj_priv->stride / 128) - 1;
+	if (obj_priv->tiling_mode == I915_TILING_Y &&
+	    HAS_128_BYTE_Y_TILING(dev))
+		tile_width = 128;
 	else
-		pitch_val = (obj_priv->stride / 512) - 1;
+		tile_width = 512;
+
+	/* Note: pitch better be a power of two tile widths */
+	pitch_val = obj_priv->stride / tile_width;
+	pitch_val = ffs(pitch_val) - 1;
 
 	val = obj_priv->gtt_offset;
 	if (obj_priv->tiling_mode == I915_TILING_Y)
@@ -1486,7 +1492,8 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg)
 
 	if ((obj_priv->gtt_offset & ~I915_FENCE_START_MASK) ||
 	    (obj_priv->gtt_offset & (obj->size - 1))) {
-		WARN(1, "%s: object not 1M or size aligned\n", __func__);
+		WARN(1, "%s: object 0x%08x not 1M or size aligned\n",
+		     __func__, obj_priv->gtt_offset);
 		return;
 	}
 
@@ -1506,6 +1513,7 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg)
 /**
  * i915_gem_object_get_fence_reg - set up a fence reg for an object
  * @obj: object to map through a fence reg
+ * @write: object is about to be written
  *
  * When mapping objects through the GTT, userspace wants to be able to write
  * to them without having to worry about swizzling if the object is tiled.
@@ -1517,7 +1525,7 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *reg)
  * and tiling format.
  */
 static int
-i915_gem_object_get_fence_reg(struct drm_gem_object *obj)
+i915_gem_object_get_fence_reg(struct drm_gem_object *obj, bool write)
 {
 	struct drm_device *dev = obj->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -1530,12 +1538,18 @@ i915_gem_object_get_fence_reg(struct drm_gem_object *obj)
 		WARN(1, "allocating a fence for non-tiled object?\n");
 		break;
 	case I915_TILING_X:
-		WARN(obj_priv->stride & (512 - 1),
-		     "object is X tiled but has non-512B pitch\n");
+		if (!obj_priv->stride)
+			return -EINVAL;
+		WARN((obj_priv->stride & (512 - 1)),
+		     "object 0x%08x is X tiled but has non-512B pitch\n",
+		     obj_priv->gtt_offset);
 		break;
 	case I915_TILING_Y:
-		WARN(obj_priv->stride & (128 - 1),
-		     "object is Y tiled but has non-128B pitch\n");
+		if (!obj_priv->stride)
+			return -EINVAL;
+		WARN((obj_priv->stride & (128 - 1)),
+		     "object 0x%08x is Y tiled but has non-128B pitch\n",
+		     obj_priv->gtt_offset);
 		break;
 	}
 
@@ -1637,7 +1651,7 @@ i915_gem_object_bind_to_gtt(struct drm_gem_object *obj, unsigned alignment)
 	if (dev_priv->mm.suspended)
 		return -EBUSY;
 	if (alignment == 0)
-		alignment = PAGE_SIZE;
+		alignment = i915_gem_get_gtt_alignment(obj);
 	if (alignment & (PAGE_SIZE - 1)) {
 		DRM_ERROR("Invalid object alignment requested %u\n", alignment);
 		return -EINVAL;
@@ -2658,6 +2672,14 @@ i915_gem_object_pin(struct drm_gem_object *obj, uint32_t alignment)
 				DRM_ERROR("Failure to bind: %d", ret);
 			return ret;
 		}
+		/*
+		 * Pre-965 chips need a fence register set up in order to
+		 * properly handle tiled surfaces.
+		 */
+		if (!IS_I965G(dev) &&
+		    obj_priv->fence_reg == I915_FENCE_REG_NONE &&
+		    obj_priv->tiling_mode != I915_TILING_NONE)
+			i915_gem_object_get_fence_reg(obj, true);
 	}
 	obj_priv->pin_count++;
 
@@ -3297,7 +3319,7 @@ i915_gem_load(struct drm_device *dev)
 	/* Old X drivers will take 0-2 for front, back, depth buffers */
 	dev_priv->fence_reg_start = 3;
 
-	if (IS_I965G(dev))
+	if (IS_I965G(dev) || IS_I945G(dev) || IS_I945GM(dev) || IS_G33(dev))
 		dev_priv->num_fence_regs = 16;
 	else
 		dev_priv->num_fence_regs = 8;
diff --git a/drivers/gpu/drm/i915/i915_gem_tiling.c b/drivers/gpu/drm/i915/i915_gem_tiling.c
index 241f39b7f460..2534c792808e 100644
--- a/drivers/gpu/drm/i915/i915_gem_tiling.c
+++ b/drivers/gpu/drm/i915/i915_gem_tiling.c
@@ -173,6 +173,73 @@ i915_gem_detect_bit_6_swizzle(struct drm_device *dev)
 	dev_priv->mm.bit_6_swizzle_y = swizzle_y;
 }
 
+
+/**
+ * Returns the size of the fence for a tiled object of the given size.
+ */
+static int
+i915_get_fence_size(struct drm_device *dev, int size)
+{
+	int i;
+	int start;
+
+	if (IS_I965G(dev)) {
+		/* The 965 can have fences at any page boundary. */
+		return ALIGN(size, 4096);
+	} else {
+		/* Align the size to a power of two greater than the smallest
+		 * fence size.
+		 */
+		if (IS_I9XX(dev))
+			start = 1024 * 1024;
+		else
+			start = 512 * 1024;
+
+		for (i = start; i < size; i <<= 1)
+			;
+
+		return i;
+	}
+}
+
+/* Check pitch constriants for all chips & tiling formats */
+static bool
+i915_tiling_ok(struct drm_device *dev, int stride, int size, int tiling_mode)
+{
+	int tile_width;
+
+	/* Linear is always fine */
+	if (tiling_mode == I915_TILING_NONE)
+		return true;
+
+	if (tiling_mode == I915_TILING_Y && HAS_128_BYTE_Y_TILING(dev))
+		tile_width = 128;
+	else
+		tile_width = 512;
+
+	/* 965+ just needs multiples of tile width */
+	if (IS_I965G(dev)) {
+		if (stride & (tile_width - 1))
+			return false;
+		return true;
+	}
+
+	/* Pre-965 needs power of two tile widths */
+	if (stride < tile_width)
+		return false;
+
+	if (stride & (stride - 1))
+		return false;
+
+	/* We don't handle the aperture area covered by the fence being bigger
+	 * than the object size.
+	 */
+	if (i915_get_fence_size(dev, size) != size)
+		return false;
+
+	return true;
+}
+
 /**
  * Sets the tiling mode of an object, returning the required swizzling of
  * bit 6 of addresses in the object.
@@ -191,6 +258,9 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
 		return -EINVAL;
 	obj_priv = obj->driver_private;
 
+	if (!i915_tiling_ok(dev, args->stride, obj->size, args->tiling_mode))
+		return -EINVAL;
+
 	mutex_lock(&dev->struct_mutex);
 
 	if (args->tiling_mode == I915_TILING_NONE) {
@@ -207,7 +277,23 @@ i915_gem_set_tiling(struct drm_device *dev, void *data,
 			args->swizzle_mode = I915_BIT_6_SWIZZLE_NONE;
 		}
 	}
-	obj_priv->tiling_mode = args->tiling_mode;
+	if (args->tiling_mode != obj_priv->tiling_mode) {
+		int ret;
+
+		/* Unbind the object, as switching tiling means we're
+		 * switching the cache organization due to fencing, probably.
+		 */
+		ret = i915_gem_object_unbind(obj);
+		if (ret != 0) {
+			WARN(ret != -ERESTARTSYS,
+			     "failed to unbind object for tiling switch");
+			args->tiling_mode = obj_priv->tiling_mode;
+			mutex_unlock(&dev->struct_mutex);
+
+			return ret;
+		}
+		obj_priv->tiling_mode = args->tiling_mode;
+	}
 	obj_priv->stride = args->stride;
 
 	mutex_unlock(&dev->struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index 273162579e1b..928e00462570 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -186,12 +186,12 @@
 #define FENCE_REG_830_0			0x2000
 #define   I830_FENCE_START_MASK		0x07f80000
 #define   I830_FENCE_TILING_Y_SHIFT	12
-#define   I830_FENCE_SIZE_BITS(size)	((get_order(size >> 19) - 1) << 8)
+#define   I830_FENCE_SIZE_BITS(size)	((ffs((size) >> 19) - 1) << 8)
 #define   I830_FENCE_PITCH_SHIFT	4
 #define   I830_FENCE_REG_VALID		(1<<0)
 
 #define   I915_FENCE_START_MASK		0x0ff00000
-#define   I915_FENCE_SIZE_BITS(size)	((get_order(size >> 20) - 1) << 8)
+#define   I915_FENCE_SIZE_BITS(size)	((ffs((size) >> 20) - 1) << 8)
 
 #define FENCE_REG_965_0			0x03000
 #define   I965_FENCE_PITCH_SHIFT	2
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index b3bcf72dc656..912cd52db965 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -261,6 +261,7 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_LAST_DISPATCH         3
 #define I915_PARAM_CHIPSET_ID            4
 #define I915_PARAM_HAS_GEM               5
+#define I915_PARAM_NUM_FENCES_AVAIL      6
 
 typedef struct drm_i915_getparam {
 	int param;
@@ -272,6 +273,7 @@ typedef struct drm_i915_getparam {
 #define I915_SETPARAM_USE_MI_BATCHBUFFER_START            1
 #define I915_SETPARAM_TEX_LRU_LOG_GRANULARITY             2
 #define I915_SETPARAM_ALLOW_BATCHBUFFER                   3
+#define I915_SETPARAM_NUM_USED_FENCES                     4
 
 typedef struct drm_i915_setparam {
 	int param;
-- 
cgit 


From d3770449d3cb058b94ca1d050d5ced4a66c75ce4 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 8 Feb 2009 09:58:38 -0500
Subject: percpu: make PER_CPU_BASE_SECTION overridable by arches

Impact: bug fix

IA-64 needs to put percpu data in a separate section even on UP.
Fixes regression caused by "percpu: refactor percpu.h"

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/include/asm/percpu.h | 4 ++--
 include/linux/percpu.h         | 8 +++++++-
 2 files changed, 9 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/arch/ia64/include/asm/percpu.h b/arch/ia64/include/asm/percpu.h
index 77f30b664b4e..30cf46534dd2 100644
--- a/arch/ia64/include/asm/percpu.h
+++ b/arch/ia64/include/asm/percpu.h
@@ -27,12 +27,12 @@ extern void *per_cpu_init(void);
 
 #else /* ! SMP */
 
-#define PER_CPU_ATTRIBUTES	__attribute__((__section__(".data.percpu")))
-
 #define per_cpu_init()				(__phys_per_cpu_start)
 
 #endif	/* SMP */
 
+#define PER_CPU_BASE_SECTION ".data.percpu"
+
 /*
  * Be extremely careful when taking the address of this variable!  Due to virtual
  * remapping, it is different from the canonical address returned by __get_cpu_var(var)!
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 0e24202b5a4e..3577ffd90d45 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -8,8 +8,15 @@
 
 #include <asm/percpu.h>
 
+#ifndef PER_CPU_BASE_SECTION
 #ifdef CONFIG_SMP
 #define PER_CPU_BASE_SECTION ".data.percpu"
+#else
+#define PER_CPU_BASE_SECTION ".data"
+#endif
+#endif
+
+#ifdef CONFIG_SMP
 
 #ifdef MODULE
 #define PER_CPU_SHARED_ALIGNED_SECTION ""
@@ -20,7 +27,6 @@
 
 #else
 
-#define PER_CPU_BASE_SECTION ".data"
 #define PER_CPU_SHARED_ALIGNED_SECTION ""
 #define PER_CPU_FIRST_SECTION ""
 
-- 
cgit 


From a5ef7ca0e2636bad0ccd07b996d775348ae2b65e Mon Sep 17 00:00:00 2001
From: Kyle McMartin <kyle@redhat.com>
Date: Sun, 8 Feb 2009 17:39:58 -0500
Subject: x86: spinlocks: define dummy __raw_spin_is_contended

Architectures other than mips and x86 are not using ticket spinlocks.
Therefore, the contention on the lock is meaningless, since there is
nobody known to be waiting on it (arguably /fairly/ unfair locks).

Dummy it out to return 0 on other architectures.

Signed-off-by: Kyle McMartin <kyle@redhat.com>
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/spinlock.h | 1 +
 arch/x86/include/asm/paravirt.h  | 1 +
 arch/x86/include/asm/spinlock.h  | 1 +
 include/linux/spinlock.h         | 5 +++++
 4 files changed, 8 insertions(+)

(limited to 'include')

diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 1a1f320c30d8..0884947ebe27 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -51,6 +51,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
 
 	return (((counters >> 14) - counters) & 0x1fff) > 1;
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index ba3e2ff6aedc..c09a14127584 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1402,6 +1402,7 @@ static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
 {
 	return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
 {
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index d17c91981da2..8247e94ac6b1 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -245,6 +245,7 @@ static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
 {
 	return __ticket_spin_is_contended(lock);
 }
+#define __raw_spin_is_contended	__raw_spin_is_contended
 
 static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
 {
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index e0c0fccced46..a0c66a2e00ad 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -124,7 +124,12 @@ do {								\
 #ifdef CONFIG_GENERIC_LOCKBREAK
 #define spin_is_contended(lock) ((lock)->break_lock)
 #else
+
+#ifdef __raw_spin_is_contended
 #define spin_is_contended(lock)	__raw_spin_is_contended(&(lock)->raw_lock)
+#else
+#define spin_is_contended(lock)	(((void)(lock), 0))
+#endif /*__raw_spin_is_contended*/
 #endif
 
 /**
-- 
cgit 


From 43a990765a9e874350bae1009366d00809dbc9d8 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Tue, 10 Feb 2009 00:00:22 +0100
Subject: sound: Remove OSSlib stuff from linux/soundcard.h

Removed OSSlib stuff from linux/soundcard.h to fix the warnings for
'make headers_check'.

This patch breaks building against OSSlib with the kernel headers
instead of its own headers. It should still work with any
version of the library from 2003 onwards, which provides
its own headers for the latest interface.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Takashi Iwai <tiwai@suse.de>
---
 include/linux/soundcard.h | 74 +++++++++++++++--------------------------------
 1 file changed, 23 insertions(+), 51 deletions(-)

(limited to 'include')

diff --git a/include/linux/soundcard.h b/include/linux/soundcard.h
index 523d069c862c..1904afedb82f 100644
--- a/include/linux/soundcard.h
+++ b/include/linux/soundcard.h
@@ -1045,50 +1045,36 @@ typedef struct mixer_vol_table {
  */
 #define LOCL_STARTAUDIO		1
 
-#if (!defined(__KERNEL__) && !defined(KERNEL) && !defined(INKERNEL) && !defined(_KERNEL)) || defined(USE_SEQ_MACROS) 
+#if !defined(__KERNEL__) || defined(USE_SEQ_MACROS)
 /*
  *	Some convenience macros to simplify programming of the
  *	/dev/sequencer interface
  *
- *	These macros define the API which should be used when possible.
+ *	This is a legacy interface for applications written against
+ *	the OSSlib-3.8 style interface. It is no longer possible
+ *	to actually link against OSSlib with this header, but we
+ *	still provide these macros for programs using them.
+ *
+ *	If you want to use OSSlib, it is recommended that you get
+ *	the GPL version of OSS-4.x and build against that version
+ *	of the header.
+ *
+ *	We redefine the extern keyword so that make headers_check
+ *	does not complain about SEQ_USE_EXTBUF.
  */
 #define SEQ_DECLAREBUF()		SEQ_USE_EXTBUF()
 
 void seqbuf_dump(void);	/* This function must be provided by programs */
 
-extern int OSS_init(int seqfd, int buflen);
-extern void OSS_seqbuf_dump(int fd, unsigned char *buf, int buflen);
-extern void OSS_seq_advbuf(int len, int fd, unsigned char *buf, int buflen);
-extern void OSS_seq_needbuf(int len, int fd, unsigned char *buf, int buflen);
-extern void OSS_patch_caching(int dev, int chn, int patch,
-			      int fd, unsigned char *buf, int buflen);
-extern void OSS_drum_caching(int dev, int chn, int patch,
-			      int fd, unsigned char *buf, int buflen);
-extern void OSS_write_patch(int fd, unsigned char *buf, int len);
-extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
-
 #define SEQ_PM_DEFINES int __foo_bar___
-#ifdef OSSLIB
-#  define SEQ_USE_EXTBUF() \
-		extern unsigned char *_seqbuf; \
-		extern int _seqbuflen;extern int _seqbufptr
-#  define SEQ_DEFINEBUF(len) SEQ_USE_EXTBUF();static int _requested_seqbuflen=len
-#  define _SEQ_ADVBUF(len) OSS_seq_advbuf(len, seqfd, _seqbuf, _seqbuflen)
-#  define _SEQ_NEEDBUF(len) OSS_seq_needbuf(len, seqfd, _seqbuf, _seqbuflen)
-#  define SEQ_DUMPBUF() OSS_seqbuf_dump(seqfd, _seqbuf, _seqbuflen)
-
-#  define SEQ_LOAD_GMINSTR(dev, instr) \
-		OSS_patch_caching(dev, -1, instr, seqfd, _seqbuf, _seqbuflen)
-#  define SEQ_LOAD_GMDRUM(dev, drum) \
-		OSS_drum_caching(dev, -1, drum, seqfd, _seqbuf, _seqbuflen)
-#else /* !OSSLIB */
-
-#  define SEQ_LOAD_GMINSTR(dev, instr)
-#  define SEQ_LOAD_GMDRUM(dev, drum)
-
-#  define SEQ_USE_EXTBUF() \
-		extern unsigned char _seqbuf[]; \
-		extern int _seqbuflen;extern int _seqbufptr
+
+#define SEQ_LOAD_GMINSTR(dev, instr)
+#define SEQ_LOAD_GMDRUM(dev, drum)
+
+#define _SEQ_EXTERN extern
+#define SEQ_USE_EXTBUF() \
+		_SEQ_EXTERN unsigned char _seqbuf[]; \
+		_SEQ_EXTERN int _seqbuflen; _SEQ_EXTERN int _seqbufptr
 
 #ifndef USE_SIMPLE_MACROS
 /* Sample seqbuf_dump() implementation:
@@ -1131,7 +1117,6 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
  */
 #define _SEQ_NEEDBUF(len)	/* empty */
 #endif
-#endif /* !OSSLIB */
 
 #define SEQ_VOLUME_MODE(dev, mode)	{_SEQ_NEEDBUF(8);\
 					_seqbuf[_seqbufptr] = SEQ_EXTENDED;\
@@ -1215,14 +1200,8 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
 		_CHN_COMMON(dev, MIDI_CHN_PRESSURE, chn, pressure, 0, 0)
 
 #define SEQ_SET_PATCH SEQ_PGM_CHANGE
-#ifdef OSSLIB
-#   define SEQ_PGM_CHANGE(dev, chn, patch) \
-		{OSS_patch_caching(dev, chn, patch, seqfd, _seqbuf, _seqbuflen); \
-		 _CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0);}
-#else
-#   define SEQ_PGM_CHANGE(dev, chn, patch) \
+#define SEQ_PGM_CHANGE(dev, chn, patch) \
 		_CHN_COMMON(dev, MIDI_PGM_CHANGE, chn, patch, 0, 0)
-#endif
 
 #define SEQ_CONTROL(dev, chn, controller, value) \
 		_CHN_COMMON(dev, MIDI_CTL_CHANGE, chn, controller, 0, value)
@@ -1300,19 +1279,12 @@ extern int OSS_write_patch2(int fd, unsigned char *buf, int len);
 /*
  * Patch loading.
  */
-#ifdef OSSLIB
-#   define SEQ_WRPATCH(patchx, len) \
-		OSS_write_patch(seqfd, (char*)(patchx), len)
-#   define SEQ_WRPATCH2(patchx, len) \
-		OSS_write_patch2(seqfd, (char*)(patchx), len)
-#else
-#   define SEQ_WRPATCH(patchx, len) \
+#define SEQ_WRPATCH(patchx, len) \
 		{if (_seqbufptr) SEQ_DUMPBUF();\
 		 if (write(seqfd, (char*)(patchx), len)==-1) \
 		    perror("Write patch: /dev/sequencer");}
-#   define SEQ_WRPATCH2(patchx, len) \
+#define SEQ_WRPATCH2(patchx, len) \
 		(SEQ_DUMPBUF(), write(seqfd, (char*)(patchx), len))
-#endif
 
 #endif
 #endif
-- 
cgit 


From 6cd61c0baa8bce32271226198b46c67a7a05d108 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:39 +0900
Subject: elf: add ELF_CORE_COPY_KERNEL_REGS()

ELF core dump is used for both user land core dump and kernel crash
dump.  Depending on architecture, register might need to be accessed
differently for userland and kernel.  Allow architectures to define
ELF_CORE_COPY_KERNEL_REGS() and use different operation for kernel
register dump.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/elfcore.h | 9 +++++++++
 kernel/kexec.c          | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 5ca54d77079f..7605c5e9589f 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -111,6 +111,15 @@ static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *re
 #endif
 }
 
+static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
+{
+#ifdef ELF_CORE_COPY_KERNEL_REGS
+	ELF_CORE_COPY_KERNEL_REGS((*elfregs), regs);
+#else
+	elf_core_copy_regs(elfregs, regs);
+#endif
+}
+
 static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
 {
 #ifdef ELF_CORE_COPY_TASK_REGS
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864e..795e7b67a228 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
 		return;
 	memset(&prstatus, 0, sizeof(prstatus));
 	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
 	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
 		      	      &prstatus, sizeof(prstatus));
 	final_note(buf);
-- 
cgit 


From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mel@csn.ul.ie>
Date: Tue, 10 Feb 2009 14:02:27 +0000
Subject: Do not account for the address space used by hugetlbfs using
 VM_ACCOUNT

When overcommit is disabled, the core VM accounts for pages used by anonymous
shared, private mappings and special mappings. It keeps track of VMAs that
should be accounted for with VM_ACCOUNT and VMAs that never had a reserve
with VM_NORESERVE.

Overcommit for hugetlbfs is much riskier than overcommit for base pages
due to contiguity requirements. It avoids overcommitting on both shared and
private mappings using reservation counters that are checked and updated
during mmap(). This ensures (within limits) that hugepages exist in the
future when faults occur; otherwise it is too easy for applications to be
SIGKILLed.

As hugetlbfs makes its own reservations of a different unit to the base page
size, VM_ACCOUNT should never be set. Even if the units were correct, we would
double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may
be set because an application can request no reserves be made for hugetlbfs
at the risk of getting killed later.

With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and
VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This
breaks the accounting for both the core VM and hugetlbfs: it can trigger an
OOM storm when hugepage pools are too small, and lockups and corrupted
counters otherwise. This patch brings hugetlbfs more in line with how the
core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/hugetlbfs/inode.c    |  8 +++++---
 include/linux/hugetlb.h |  5 +++--
 include/linux/mm.h      |  3 +--
 ipc/shm.c               |  8 +++++---
 mm/fremap.c             |  2 +-
 mm/hugetlb.c            | 39 +++++++++++++++++++++++++--------------
 mm/mmap.c               | 38 ++++++++++++++++++++++----------------
 mm/mprotect.c           |  5 +++--
 8 files changed, 65 insertions(+), 43 deletions(-)

(limited to 'include')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma))
+				len >> huge_page_shift(h), vma,
+				vma->vm_flags))
 		goto out;
 
 	ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
 			can_do_mlock());
 }
 
-struct file *hugetlb_file_setup(const char *name, size_t size)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 
 	error = -ENOMEM;
 	if (hugetlb_reserve_pages(inode, 0,
-			size >> huge_page_shift(hstate_inode(inode)), NULL))
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
 		goto out_inode;
 
 	d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f1d2fba19ea0..af09660001c7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
-						struct vm_area_struct *vma);
+						struct vm_area_struct *vma,
+						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 
 extern unsigned long hugepages_treat_as_movable;
@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t);
+struct file *hugetlb_file_setup(const char *name, size_t, int);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8ddc98b8405..323561582c10 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long flag, unsigned long pgoff);
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long flags,
-	unsigned int vm_flags, unsigned long pgoff,
-	int accountable);
+	unsigned int vm_flags, unsigned long pgoff);
 
 static inline unsigned long do_mmap(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
diff --git a/ipc/shm.c b/ipc/shm.c
index f8f69fad3a27..05d51d2a792c 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	struct file * file;
 	char name[13];
 	int id;
+	int acctflag = 0;
 
 	if (size < SHMMIN || size > ns->shm_ctlmax)
 		return -EINVAL;
@@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
 	sprintf (name, "SYSV%08x", key);
 	if (shmflg & SHM_HUGETLB) {
-		/* hugetlb_file_setup takes care of mlock user accounting */
-		file = hugetlb_file_setup(name, size);
+		/* hugetlb_file_setup applies strict accounting */
+		if (shmflg & SHM_NORESERVE)
+			acctflag = VM_NORESERVE;
+		file = hugetlb_file_setup(name, size, acctflag);
 		shp->mlock_user = current_user();
 	} else {
-		int acctflag = 0;
 		/*
 		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
 	 	 * if it's asked for.
diff --git a/mm/fremap.c b/mm/fremap.c
index 736ba7f3306a..b6ec85abbb39 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 			flags &= MAP_NONBLOCK;
 			get_file(file);
 			addr = mmap_region(file, start, size,
-					flags, vma->vm_flags, pgoff, 1);
+					flags, vma->vm_flags, pgoff);
 			fput(file);
 			if (IS_ERR_VALUE(addr)) {
 				err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e98304080..207464209546 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 
 int hugetlb_reserve_pages(struct inode *inode,
 					long from, long to,
-					struct vm_area_struct *vma)
+					struct vm_area_struct *vma,
+					int acctflag)
 {
-	long ret, chg;
+	long ret = 0, chg;
 	struct hstate *h = hstate_inode(inode);
 
-	if (vma && vma->vm_flags & VM_NORESERVE)
-		return 0;
-
 	/*
 	 * Shared mappings base their reservation on the number of pages that
 	 * are already allocated on behalf of the file. Private mappings need
@@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode,
 	 */
 	if (!vma || vma->vm_flags & VM_SHARED)
 		chg = region_chg(&inode->i_mapping->private_list, from, to);
-	else {
-		struct resv_map *resv_map = resv_map_alloc();
-		if (!resv_map)
-			return -ENOMEM;
-
+	else
 		chg = to - from;
 
-		set_vma_resv_map(vma, resv_map);
-		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
-	}
-
 	if (chg < 0)
 		return chg;
 
 	if (hugetlb_get_quota(inode->i_mapping, chg))
 		return -ENOSPC;
+
+	/*
+	 * Only apply hugepage reservation if asked. We still have to
+	 * take the filesystem quota because it is an upper limit
+	 * defined for the mount and not necessarily memory as a whole
+	 */
+	if (acctflag & VM_NORESERVE) {
+		reset_vma_resv_huge_pages(vma);
+		return 0;
+	}
+
 	ret = hugetlb_acct_memory(h, chg);
 	if (ret < 0) {
 		hugetlb_put_quota(inode->i_mapping, chg);
@@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode,
 	}
 	if (!vma || vma->vm_flags & VM_SHARED)
 		region_add(&inode->i_mapping->private_list, from, to);
+	else {
+		struct resv_map *resv_map = resv_map_alloc();
+
+		if (!resv_map)
+			return -ENOMEM;
+
+		set_vma_resv_map(vma, resv_map);
+		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+	}
+
 	return 0;
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 214b6a258eeb..eb1270bebe67 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	struct inode *inode;
 	unsigned int vm_flags;
 	int error;
-	int accountable = 1;
 	unsigned long reqprot = prot;
 
 	/*
@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 					return -EPERM;
 				vm_flags &= ~VM_MAYEXEC;
 			}
-			if (is_file_hugepages(file))
-				accountable = 0;
 
 			if (!file->f_op || !file->f_op->mmap)
 				return -ENODEV;
@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	if (error)
 		return error;
 
-	return mmap_region(file, addr, len, flags, vm_flags, pgoff,
-			   accountable);
+	return mmap_region(file, addr, len, flags, vm_flags, pgoff);
 }
 EXPORT_SYMBOL(do_mmap_pgoff);
 
@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
 
 /*
  * We account for memory if it's a private writeable mapping,
- * and VM_NORESERVE wasn't set.
+ * not hugepages and VM_NORESERVE wasn't set.
  */
-static inline int accountable_mapping(unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
 {
+	/*
+	 * hugetlb has its own accounting separate from the core VM
+	 * VM_HUGETLB may not be set yet so we cannot check for that flag.
+	 */
+	if (file && is_file_hugepages(file))
+		return 0;
+
 	return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
 unsigned long mmap_region(struct file *file, unsigned long addr,
 			  unsigned long len, unsigned long flags,
-			  unsigned int vm_flags, unsigned long pgoff,
-			  int accountable)
+			  unsigned int vm_flags, unsigned long pgoff)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma, *prev;
@@ -1128,18 +1130,22 @@ munmap_back:
 
 	/*
 	 * Set 'VM_NORESERVE' if we should not account for the
-	 * memory use of this mapping. We only honor MAP_NORESERVE
-	 * if we're allowed to overcommit memory.
+	 * memory use of this mapping.
 	 */
-	if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-		vm_flags |= VM_NORESERVE;
-	if (!accountable)
-		vm_flags |= VM_NORESERVE;
+	if ((flags & MAP_NORESERVE)) {
+		/* We honor MAP_NORESERVE if allowed to overcommit */
+		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+			vm_flags |= VM_NORESERVE;
+
+		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
+		if (file && is_file_hugepages(file))
+			vm_flags |= VM_NORESERVE;
+	}
 
 	/*
 	 * Private writable mapping: check memory availability
 	 */
-	if (accountable_mapping(vm_flags)) {
+	if (accountable_mapping(file, vm_flags)) {
 		charged = len >> PAGE_SHIFT;
 		if (security_vm_enough_memory(charged))
 			return -ENOMEM;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abe2694e13f4..258197b76fb4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
-	 * make it unwritable again.
+	 * make it unwritable again. hugetlb mapping were accounted for
+	 * even if read-only so there is no need to account for them here
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
 						VM_SHARED|VM_NORESERVE))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
-- 
cgit 


From 1db8508cf483dc1ecf66141f90a7c03659d69512 Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Tue, 10 Feb 2009 23:27:32 +0100
Subject: hugetlbfs: fix build failure with !CONFIG_HUGETLBFS

Fix regression due to 5a6fe125950676015f5108fb71b2a67441755003,
"Do not account for the address space used by hugetlbfs using VM_ACCOUNT"
which added an argument to the function hugetlb_file_setup() but not to
the macro hugetlb_file_setup().

Reported-by: Chris Clayton <chris2553@googlemail.com>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index af09660001c7..03be7f29ca01 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -159,9 +159,9 @@ static inline void set_file_hugepages(struct file *file)
 }
 #else /* !CONFIG_HUGETLBFS */
 
-#define is_file_hugepages(file)		0
-#define set_file_hugepages(file)	BUG()
-#define hugetlb_file_setup(name,size)	ERR_PTR(-ENOSYS)
+#define is_file_hugepages(file)			0
+#define set_file_hugepages(file)		BUG()
+#define hugetlb_file_setup(name,size,acctflag)	ERR_PTR(-ENOSYS)
 
 #endif /* !CONFIG_HUGETLBFS */
 
-- 
cgit 


From e672f7db767156bf71adf9c592cfe81b339523d6 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Tue, 10 Feb 2009 17:18:17 -0800
Subject: pkt_sched: type should be __u32 in header

Using u32 in this header breaks the build of iptables.

Signed-off-by: Chuck Ebbert <cebbert@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/pkt_sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
index b2648e8e4987..d51a2b3e221e 100644
--- a/include/linux/pkt_sched.h
+++ b/include/linux/pkt_sched.h
@@ -515,7 +515,7 @@ enum
 
 struct tc_drr_stats
 {
-	u32	deficit;
+	__u32	deficit;
 };
 
 #endif
-- 
cgit 


From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 10 Feb 2009 16:37:31 +0100
Subject: timers: split process wide cpu clocks/timers, fix

To decrease the chance of a missed enable, always enable the timer when we
sample it; we'll always disable it when we find that there are no active timers
in the jiffy tick.

This fixes a flood of warnings reported by Mike Galbraith.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     |  1 +
 kernel/posix-cpu-timers.c | 42 ++++++++++++++----------------------------
 2 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 79392916d6c9..5d10fa0b6002 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2209,6 +2209,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	unsigned long flags;
 
 	spin_lock_irqsave(&cputimer->lock, flags);
+	cputimer->running = 1;
 	*times = cputimer->cputime;
 	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index db107c9bbc05..e5d7bfdfa7d4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
 	struct task_cputime cputime;
 
-	thread_group_cputime(tsk, &cputime);
+	thread_group_cputimer(tsk, &cputime);
 	cleanup_timers(tsk->signal->cpu_timers,
 		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
 }
@@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 					     now);
 }
 
-/*
- * Enable the process wide cpu timer accounting.
- *
- * serialized using ->sighand->siglock
- */
-static void start_process_timers(struct task_struct *tsk)
-{
-	tsk->signal->cputimer.running = 1;
-	barrier();
-}
-
-/*
- * Release the process wide timer accounting -- timer stops ticking when
- * nobody cares about it.
- *
- * serialized using ->sighand->siglock
- */
-static void stop_process_timers(struct task_struct *tsk)
-{
-	tsk->signal->cputimer.running = 0;
-	barrier();
-}
-
 /*
  * Insert the timer on the appropriate list before any timers that
  * expire later.  This must be called with the tasklist_lock held
@@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	BUG_ON(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
-	if (!CPUCLOCK_PERTHREAD(timer->it_clock))
-		start_process_timers(p);
-
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
@@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
+static void stop_process_timers(struct task_struct *tsk)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	unsigned long flags;
+
+	if (!cputimer->running)
+		return;
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	cputimer->running = 0;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them
  * off the tsk->*_timers list onto the firing list.  Per-thread timers
@@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	start_process_timers(tsk);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
-- 
cgit 


From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 11 Feb 2009 11:30:27 +0100
Subject: timers: fix TIMER_ABSTIME for process wide cpu timers

The POSIX timer interface allows for absolute time expiry values through the
TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock
every time we start it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     | 13 +------------
 kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d10fa0b6002..8981e52c714f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2201,18 +2201,7 @@ static inline int spin_needbreak(spinlock_t *lock)
  * Thread group CPU time accounting.
  */
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
-
-static inline
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 1;
-	*times = cputimer->cputime;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
-}
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e5d7bfdfa7d4..2313a4cc14ea 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -261,6 +261,40 @@ out:
 	rcu_read_unlock();
 }
 
+static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+{
+	if (cputime_gt(b->utime, a->utime))
+		a->utime = b->utime;
+
+	if (cputime_gt(b->stime, a->stime))
+		a->stime = b->stime;
+
+	if (b->sum_exec_runtime > a->sum_exec_runtime)
+		a->sum_exec_runtime = b->sum_exec_runtime;
+}
+
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct task_cputime sum;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	if (!cputimer->running) {
+		cputimer->running = 1;
+		/*
+		 * The POSIX timer interface allows for absolute time expiry
+		 * values through the TIMER_ABSTIME flag, therefore we have
+		 * to synchronize the timer to the clock every time we start
+		 * it.
+		 */
+		thread_group_cputime(tsk, &sum);
+		update_gt_cputime(&cputimer->cputime, &sum);
+	}
+	*times = cputimer->cputime;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
-- 
cgit 


From 9f339e7028e2855717af3193c938f9960ad13b38 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Wed, 11 Feb 2009 15:10:27 +0100
Subject: x86, ptrace, mm: fix double-free on race

Ptrace_detach() races with __ptrace_unlink() if the traced task is
reaped while detaching. This might cause a double-free of the BTS
buffer.

Change the ptrace_detach() path to only do the memory accounting in
ptrace_bts_detach() and leave the buffer free to ptrace_bts_untrace()
which will be called from __ptrace_unlink().

The fix follows a proposal from Oleg Nesterov.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c | 16 ++++++++++------
 include/linux/mm.h       |  1 +
 mm/mlock.c               |  7 ++++++-
 3 files changed, 17 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 0a5df5f82fb9..5a4c23d89892 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -810,12 +810,16 @@ static void ptrace_bts_untrace(struct task_struct *child)
 
 static void ptrace_bts_detach(struct task_struct *child)
 {
-	if (unlikely(child->bts)) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-
-		ptrace_bts_free_buffer(child);
-	}
+	/*
+	 * Ptrace_detach() races with ptrace_untrace() in case
+	 * the child dies and is reaped by another thread.
+	 *
+	 * We only do the memory accounting at this point and
+	 * leave the buffer deallocation and the bts tracer
+	 * release to ptrace_bts_untrace() which will be called
+	 * later on with tasklist_lock held.
+	 */
+	release_locked_buffer(child->bts_buffer, child->bts_size);
 }
 #else
 static inline void ptrace_bts_fork(struct task_struct *tsk) {}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8ddc98b8405..3d7fb44d7d7e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1305,5 +1305,6 @@ void vmemmap_populate_print_last(void);
 
 extern void *alloc_locked_buffer(size_t size);
 extern void free_locked_buffer(void *buffer, size_t size);
+extern void release_locked_buffer(void *buffer, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 028ec482fdd4..2b57f7e60390 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -657,7 +657,7 @@ void *alloc_locked_buffer(size_t size)
 	return buffer;
 }
 
-void free_locked_buffer(void *buffer, size_t size)
+void release_locked_buffer(void *buffer, size_t size)
 {
 	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
@@ -667,6 +667,11 @@ void free_locked_buffer(void *buffer, size_t size)
 	current->mm->locked_vm -= pgsz;
 
 	up_write(&current->mm->mmap_sem);
+}
+
+void free_locked_buffer(void *buffer, size_t size)
+{
+	release_locked_buffer(buffer, size);
 
 	kfree(buffer);
 }
-- 
cgit 


From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 11 Feb 2009 13:04:36 -0800
Subject: cgroups: fix lockdep subclasses overflow

I enabled all cgroup subsystems when compiling kernel, and then:
 # mount -t cgroup -o net_cls xxx /mnt
 # mkdir /mnt/0

This showed up immediately:
 BUG: MAX_LOCKDEP_SUBCLASSES too low!
 turning off the locking correctness validator.

It's caused by the cgroup hierarchy lock:
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss->root == root)
			mutex_lock_nested(&ss->hierarchy_mutex, i);
	}

Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but
MAX_LOCKDEP_SUBCLASSES is 8.

This patch uses different lockdep keys for different subsystems.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 1 +
 kernel/cgroup.c        | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e4e8e117d27d..499900d0cee7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -378,6 +378,7 @@ struct cgroup_subsys {
 	 * - initiating hotplug events
 	 */
 	struct mutex hierarchy_mutex;
+	struct lock_class_key subsys_key;
 
 	/*
 	 * Link to parent, and list entry in parent's children.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5a54ff42874e..e14db9c089b9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		if (ss->root == root)
-			mutex_lock_nested(&ss->hierarchy_mutex, i);
+			mutex_lock(&ss->hierarchy_mutex);
 	}
 }
 
@@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(!list_empty(&init_task.tasks));
 
 	mutex_init(&ss->hierarchy_mutex);
+	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 }
 
-- 
cgit 


From 6c5979631b4b03c9288776562c18036765e398c1 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 11 Feb 2009 13:04:38 -0800
Subject: syscall define: fix uml compile bug

With the new system call defines we get this on uml:

arch/um/sys-i386/built-in.o: In function `sys_call_table':
(.rodata+0x308): undefined reference to `sys_sigprocmask'

Reason for this is that uml passes the preprocessor option
-Dsigprocmask=kernel_sigprocmask to gcc when compiling the kernel.
This causes SYSCALL_DEFINE3(sigprocmask, ...) to be expanded to
SYSCALL_DEFINEx(3, kernel_sigprocmask, ...) and finally to a system
call named sys_kernel_sigprocmask.  However sys_sigprocmask is missing
because of this.

To avoid macro expansion for the system call name just concatenate the
name at the first define instead of carrying it through several levels.
This was pointed out by Al Viro.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Reviewed-by: WANG Cong <wangcong@zeuux.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/syscalls.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 0eda02ff2414..f9f900cfd066 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -95,13 +95,13 @@ struct old_linux_dirent;
 #define __SC_TEST5(t5, a5, ...)	__SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
 #define __SC_TEST6(t6, a6, ...)	__SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
-#define SYSCALL_DEFINE0(name)   asmlinkage long sys_##name(void)
-#define SYSCALL_DEFINE1(...)    SYSCALL_DEFINEx(1, __VA_ARGS__)
-#define SYSCALL_DEFINE2(...)    SYSCALL_DEFINEx(2, __VA_ARGS__)
-#define SYSCALL_DEFINE3(...)    SYSCALL_DEFINEx(3, __VA_ARGS__)
-#define SYSCALL_DEFINE4(...)    SYSCALL_DEFINEx(4, __VA_ARGS__)
-#define SYSCALL_DEFINE5(...)    SYSCALL_DEFINEx(5, __VA_ARGS__)
-#define SYSCALL_DEFINE6(...)    SYSCALL_DEFINEx(6, __VA_ARGS__)
+#define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
+#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
+#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)
 
 #ifdef CONFIG_PPC64
 #define SYSCALL_ALIAS(alias, name)					\
@@ -121,21 +121,21 @@ struct old_linux_dirent;
 
 #define SYSCALL_DEFINE(name) static inline long SYSC_##name
 #define SYSCALL_DEFINEx(x, name, ...)					\
-	asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__));		\
-	static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__));	\
-	asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__))		\
+	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
+	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
+	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
 	{								\
 		__SC_TEST##x(__VA_ARGS__);				\
-		return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__));	\
+		return (long) SYSC##name(__SC_CAST##x(__VA_ARGS__));	\
 	}								\
-	SYSCALL_ALIAS(sys_##name, SyS_##name);				\
-	static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__))
+	SYSCALL_ALIAS(sys##name, SyS##name);				\
+	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__))
 
 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
 #define SYSCALL_DEFINEx(x, name, ...)					\
-	asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__))
+	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
 
 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
-- 
cgit 


From 99709372736a216f99eb32b76fba835a2bfc93a8 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 12 Feb 2009 16:43:17 -0800
Subject: net: don't use in_atomic() in gfp_any()

The problem is that in_atomic() will return false inside spinlocks if
CONFIG_PREEMPT=n.  This will lead to deadlockable GFP_KERNEL allocations
from spinlocked regions.

Secondly, if CONFIG_PREEMPT=y, this bug solves itself because networking
will instead use GFP_ATOMIC from this callsite.  Hence we won't get the
might_sleep() debugging warnings which would have informed us of the buggy
callsites.

Solve both these problems by switching to in_softirq().  Now, if someone
runs a gfp_any() allocation from inside spinlock we will get the warning
if CONFIG_PREEMPT=y.

I reviewed all callsites and most of them were too complex for my little
brain and none of them documented their interface requirements.  I have no
idea what this patch will do.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/sock.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/net/sock.h b/include/net/sock.h
index 5a3a151bd730..ce3b5b622683 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1308,7 +1308,7 @@ static inline int sock_writeable(const struct sock *sk)
 
 static inline gfp_t gfp_any(void)
 {
-	return in_atomic() ? GFP_ATOMIC : GFP_KERNEL;
+	return in_softirq() ? GFP_ATOMIC : GFP_KERNEL;
 }
 
 static inline long sock_rcvtimeo(const struct sock *sk, int noblock)
-- 
cgit