From ce5ccdef1090367f3024b4d5e7908bf6bd2929ae Mon Sep 17 00:00:00 2001 From: Kumar Gala Date: Mon, 16 Jul 2007 23:27:10 -0500 Subject: PCI: Move prototypes for pci_bus_find_capability to include/linux/pci.h We need pci_bus_find_capability() in some arch/powerpc code so move the prototype into a header accessible to it. Also kill the duplicate prototype for pci_bus_alloc_resource(). Signed-off-by: Kumar Gala Signed-off-by: Greg Kroah-Hartman --- include/linux/pci.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index e7d8d4e19a53..325225c48458 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -578,6 +578,9 @@ int pci_set_power_state(struct pci_dev *dev, pci_power_t state); pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state); int pci_enable_wake(struct pci_dev *dev, pci_power_t state, int enable); +/* Functions for PCI Hotplug drivers to use */ +int pci_bus_find_capability (struct pci_bus *bus, unsigned int devfn, int cap); + /* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */ void pci_bus_assign_resources(struct pci_bus *bus); void pci_bus_size_bridges(struct pci_bus *bus); -- cgit From 2637e5b539f5f153f2124dbad087b5216bc68d6d Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Tue, 14 Aug 2007 12:43:48 +0200 Subject: PCI: make pcie_get_readrq visible in pci.h [PATCH] PCI: make pcie_get_readrq visible in pci.h pcie_get_readrq() is EXPORT_SYMBOL'ed, but its prototype is not visible in pci.h, add it there. This is needed by some network drivers. Signed-off-by: Brice Goglin Acked-by: Peter Oruba Signed-off-by: Greg Kroah-Hartman --- include/linux/pci.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/pci.h b/include/linux/pci.h index 325225c48458..038a0dc7273a 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -557,6 +557,7 @@ int pci_set_consistent_dma_mask(struct pci_dev *dev, u64 mask); int pcix_get_max_mmrbc(struct pci_dev *dev); int pcix_get_mmrbc(struct pci_dev *dev); int pcix_set_mmrbc(struct pci_dev *dev, int mmrbc); +int pcie_get_readrq(struct pci_dev *dev); int pcie_set_readrq(struct pci_dev *dev, int rq); void pci_update_resource(struct pci_dev *dev, struct resource *res, int resno); int __must_check pci_assign_resource(struct pci_dev *dev, int i); -- cgit From 4be8f906435a6af241821ab5b94b2b12cb7d57d8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 16 Aug 2007 14:34:42 +0900 Subject: PCI: disable MSI on RS690 RS690 can't do MSI like its predecessors. Disable MSI on RS690. 
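As background for this and the two follow-up quirks below: a final-stage PCI fixup is just a function bound to a vendor/device pair and run when a matching device is enumerated. A hedged illustration of the mechanism follows; the function name and the 0x1234 device ID are hypothetical, while the real hook used by these commits is quirk_disable_all_msi() in drivers/pci/quirks.c.

static void quirk_example_no_msi(struct pci_dev *dev)
{
	/* a real quirk would mark MSI unusable; this sketch only logs */
	printk(KERN_WARNING "PCI: MSI disabled by quirk for %s\n", pci_name(dev));
}
DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, 0x1234 /* hypothetical */, quirk_example_no_msi);
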
Signed-off-by: Tejun Heo Cc: Henry Su Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 1 + include/linux/pci_ids.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ab1c5c5d03f8..ca89ed92cd04 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1650,6 +1650,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_GCN DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1000_PCIX, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS400_200, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS480, quirk_disable_all_msi); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS690, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT3351, quirk_disable_all_msi); /* Disable MSI on chipsets that are known to not support it */ diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 8938d59013c6..0d6279c60b32 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -360,6 +360,7 @@ #define PCI_DEVICE_ID_ATI_RS400_166 0x5a32 #define PCI_DEVICE_ID_ATI_RS400_200 0x5a33 #define PCI_DEVICE_ID_ATI_RS480 0x5950 +#define PCI_DEVICE_ID_ATI_RS690 0x7910 /* ATI IXP Chipset */ #define PCI_DEVICE_ID_ATI_IXP200_IDE 0x4349 #define PCI_DEVICE_ID_ATI_IXP200_SMBUS 0x4353 -- cgit From aea6a433f50cd89b9cbd10850fd0b32f961f9883 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 18 Aug 2007 03:03:10 +0900 Subject: PCI: disable MSI on RD580 RD580 can't do MSI like its predecessors. Disable MSI on RD580. Signed-off-by: Tejun Heo CC: stable Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 1 + include/linux/pci_ids.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ca89ed92cd04..ebfc1de7579e 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1650,6 +1650,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_GCN DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1000_PCIX, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS400_200, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS480, quirk_disable_all_msi); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RD580, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS690, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT3351, quirk_disable_all_msi); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 0d6279c60b32..80c27d7bf021 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -360,6 +360,7 @@ #define PCI_DEVICE_ID_ATI_RS400_166 0x5a32 #define PCI_DEVICE_ID_ATI_RS400_200 0x5a33 #define PCI_DEVICE_ID_ATI_RS480 0x5950 +#define PCI_DEVICE_ID_ATI_RD580 0x5952 #define PCI_DEVICE_ID_ATI_RS690 0x7910 /* ATI IXP Chipset */ #define PCI_DEVICE_ID_ATI_IXP200_IDE 0x4349 -- cgit From f122392f679ebed39db08074f935d770504623eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Aug 2007 14:33:01 +0900 Subject: PCI: disable MSI on RX790 RX790 can't do MSI like its predecessors. Disable MSI on RX790. 
Signed-off-by: Tejun Heo Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/pci/quirks.c | 1 + include/linux/pci_ids.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index ebfc1de7579e..31f680f238c8 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1651,6 +1651,7 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_HT1 DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS400_200, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS480, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RD580, quirk_disable_all_msi); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RX790, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_RS690, quirk_disable_all_msi); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_VT3351, quirk_disable_all_msi); diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 80c27d7bf021..f77944e432f2 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -361,6 +361,7 @@ #define PCI_DEVICE_ID_ATI_RS400_200 0x5a33 #define PCI_DEVICE_ID_ATI_RS480 0x5950 #define PCI_DEVICE_ID_ATI_RD580 0x5952 +#define PCI_DEVICE_ID_ATI_RX790 0x5957 #define PCI_DEVICE_ID_ATI_RS690 0x7910 /* ATI IXP Chipset */ #define PCI_DEVICE_ID_ATI_IXP200_IDE 0x4349 -- cgit From 2aa44d0567ed21b47b87d68819415d48194cb923 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: sched_clock_idle_[sleep|wakeup]_event() construct a more or less wall-clock time out of sched_clock(), by using ACPI-idle's existing knowledge about how much time we spent idling. This allows the rq clock to work around TSC-stops-in-C2, TSC-gets-corrupted-in-C3 type of problems. ( Besides the scheduler's statistics this also benefits blktrace and printk-timestamps as well. ) Furthermore, the precise before-C2/C3-sleep and after-C2/C3-wakeup callbacks allow the scheduler to get out the most of the period where the CPU has a reliable TSC. This results in slightly more precise task statistics. the ACPI bits were acked by Len. 
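Condensed, the new hooks bracket deep-idle entry; a sketch of the C2 path as modified by this patch (timer broadcast, TSC-unstable marking and the overhead subtraction are omitted here):

t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);	/* PM-timer start */
sched_clock_idle_sleep_event();			/* rq clock: going deep-idle */
acpi_cstate_enter(cx);				/* actually idle in C2 */
t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);	/* PM-timer end */
sleep_ticks = ticks_elapsed(t1, t2);		/* time actually asleep */
sched_clock_idle_wakeup_event(sleep_ticks * PM_TIMER_TICK_NS);
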
Signed-off-by: Ingo Molnar Acked-by: Len Brown --- arch/i386/kernel/tsc.c | 1 - drivers/acpi/processor_idle.c | 32 +++++++++++++++++++++++++------- include/linux/sched.h | 3 ++- kernel/sched.c | 41 ++++++++++++++++++++++++++++++++--------- kernel/sched_debug.c | 3 ++- 5 files changed, 61 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/kernel/tsc.c b/arch/i386/kernel/tsc.c index debd7dbb4158..a39280b4dd3a 100644 --- a/arch/i386/kernel/tsc.c +++ b/arch/i386/kernel/tsc.c @@ -292,7 +292,6 @@ static struct clocksource clocksource_tsc = { void mark_tsc_unstable(char *reason) { - sched_clock_unstable_event(); if (!tsc_unstable) { tsc_unstable = 1; tsc_enabled = 0; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index a8634a0655fc..d9b8af763e1e 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -63,6 +63,7 @@ ACPI_MODULE_NAME("processor_idle"); #define ACPI_PROCESSOR_FILE_POWER "power" #define US_TO_PM_TIMER_TICKS(t) ((t * (PM_TIMER_FREQUENCY/1000)) / 1000) +#define PM_TIMER_TICK_NS (1000000000ULL/PM_TIMER_FREQUENCY) #define C2_OVERHEAD 4 /* 1us (3.579 ticks per us) */ #define C3_OVERHEAD 4 /* 1us (3.579 ticks per us) */ static void (*pm_idle_save) (void) __read_mostly; @@ -462,6 +463,9 @@ static void acpi_processor_idle(void) * TBD: Can't get time duration while in C1, as resumes * go to an ISR rather than here. Need to instrument * base interrupt handler. + * + * Note: the TSC better not stop in C1, sched_clock() will + * skew otherwise. */ sleep_ticks = 0xFFFFFFFF; break; @@ -469,6 +473,8 @@ static void acpi_processor_idle(void) case ACPI_STATE_C2: /* Get start time (ticks) */ t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); + /* Tell the scheduler that we are going deep-idle: */ + sched_clock_idle_sleep_event(); /* Invoke C2 */ acpi_state_timer_broadcast(pr, cx, 1); acpi_cstate_enter(cx); @@ -479,17 +485,22 @@ static void acpi_processor_idle(void) /* TSC halts in C2, so notify users */ mark_tsc_unstable("possible TSC halt in C2"); #endif + /* Compute time (ticks) that we were actually asleep */ + sleep_ticks = ticks_elapsed(t1, t2); + + /* Tell the scheduler how much we idled: */ + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); + /* Re-enable interrupts */ local_irq_enable(); + /* Do not account our idle-switching overhead: */ + sleep_ticks -= cx->latency_ticks + C2_OVERHEAD; + current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD; acpi_state_timer_broadcast(pr, cx, 0); break; case ACPI_STATE_C3: - /* * disable bus master * bm_check implies we need ARB_DIS @@ -518,6 +529,8 @@ static void acpi_processor_idle(void) t1 = inl(acpi_gbl_FADT.xpm_timer_block.address); /* Invoke C3 */ acpi_state_timer_broadcast(pr, cx, 1); + /* Tell the scheduler that we are going deep-idle: */ + sched_clock_idle_sleep_event(); acpi_cstate_enter(cx); /* Get end time (ticks) */ t2 = inl(acpi_gbl_FADT.xpm_timer_block.address); @@ -531,12 +544,17 @@ static void acpi_processor_idle(void) /* TSC halts in C3, so notify users */ mark_tsc_unstable("TSC halts in C3"); #endif + /* Compute time (ticks) that we were actually asleep */ + sleep_ticks = ticks_elapsed(t1, t2); + /* Tell the scheduler how much we idled: */ + sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS); + /* Re-enable interrupts */ local_irq_enable(); + /* Do not account our idle-switching overhead: */ + sleep_ticks -= 
cx->latency_ticks + C3_OVERHEAD; + current_thread_info()->status |= TS_POLLING; - /* Compute time (ticks) that we were actually asleep */ - sleep_ticks = - ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD; acpi_state_timer_broadcast(pr, cx, 0); break; diff --git a/include/linux/sched.h b/include/linux/sched.h index 682ef87da6eb..1845b2e99a87 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1388,7 +1388,8 @@ extern void sched_exec(void); #define sched_exec() {} #endif -extern void sched_clock_unstable_event(void); +extern void sched_clock_idle_sleep_event(void); +extern void sched_clock_idle_wakeup_event(u64 delta_ns); #ifdef CONFIG_HOTPLUG_CPU extern void idle_task_exit(void); diff --git a/kernel/sched.c b/kernel/sched.c index 45e17b83b7f1..48e7586168ef 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,7 +262,8 @@ struct rq { s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; + u64 idle_clock; + unsigned int clock_deep_idle_events; u64 tick_timestamp; atomic_t nr_iowait; @@ -556,18 +557,40 @@ static inline struct rq *this_rq_lock(void) } /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * We are going deep-idle (irqs are disabled): */ -void sched_clock_unstable_event(void) +void sched_clock_idle_sleep_event(void) { - unsigned long flags; - struct rq *rq; + struct rq *rq = cpu_rq(smp_processor_id()); - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); } +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* * resched_task - mark a task 'to be rescheduled now'. diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 87e524762b85..ab18f45f2ab2 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -154,10 +154,11 @@ static void print_cpu(struct seq_file *m, int cpu) P(next_balance); P(curr->pid); P(clock); + P(idle_clock); P(prev_clock_raw); P(clock_warps); P(clock_overflows); - P(clock_unstable_events); + P(clock_deep_idle_events); P(clock_max_delta); P(cpu_load[0]); P(cpu_load[1]); -- cgit From f8700df7c419781efb34696de7e7f49717f8ede7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 23 Aug 2007 15:18:02 +0200 Subject: sched: fix broken SMT/MC optimizations On a four package system with HT - HT load balancing optimizations were broken. For example, if two tasks end up running on two logical threads of one of the packages, scheduler is not able to pull one of the tasks to a completely idle package. In this scenario, for nice-0 tasks, imbalance calculated by scheduler will be 512 and find_busiest_queue() will return 0 (as each cpu's load is 1024 > imbalance and has only one task running). Similarly MC scheduler optimizations also get fixed with this patch. 
[ mingo@elte.hu: restored fair balancing by increasing the fuzz and adding it back to the power decision, without the /2 factor. ] Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 1845b2e99a87..ba78807eab91 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -681,7 +681,7 @@ enum cpu_idle_type { #define SCHED_LOAD_SHIFT 10 #define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) -#define SCHED_LOAD_SCALE_FUZZ (SCHED_LOAD_SCALE >> 1) +#define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #ifdef CONFIG_SMP #define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ diff --git a/kernel/sched.c b/kernel/sched.c index 5fecbbba12ac..d96030db8ff7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2517,7 +2517,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; -- cgit From a7ecd1ea913346a72f41a002c365882dc05c9bd5 Mon Sep 17 00:00:00 2001 From: Yu Luming Date: Thu, 23 Aug 2007 23:05:55 -0400 Subject: ACPI: video: Add keycode for ACPI video driver hotkey events. Signed-off-by: Luming Yu Signed-off-by: Zhang Rui Signed-off-by: Len Brown --- include/linux/input.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index e02c6a66b2ba..cf2b5619aa13 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -363,6 +363,12 @@ struct input_absinfo { #define KEY_UNKNOWN 240 +#define KEY_VIDEO_NEXT 241 /* drive next video source */ +#define KEY_VIDEO_PREV 242 /* drive previous video source */ +#define KEY_BRIGHTNESS_CYCLE 243 /* brightness up, after max is min */ +#define KEY_BRIGHTNESS_ZERO 244 /* brightness off, use ambient */ +#define KEY_DISPLAY_OFF 245 /* display device to off state */ + #define BTN_MISC 0x100 #define BTN_0 0x100 #define BTN_1 0x101 -- cgit From 6dc2c1b7798ef645213afc82f6d5eac3d61bc18b Mon Sep 17 00:00:00 2001 From: Miloslav Trmac Date: Thu, 23 Aug 2007 10:19:53 +0100 Subject: Renumber AUDIT_TTY_[GS]ET Renumber AUDIT_TTY_[GS]ET to avoid a conflict with netlink message types already used in the wild. 
Signed-off-by: Miloslav Trmac Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- include/linux/audit.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/audit.h b/include/linux/audit.h index 4bbd8601b8f0..d6579df8dadf 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -63,8 +63,8 @@ #define AUDIT_ADD_RULE 1011 /* Add syscall filtering rule */ #define AUDIT_DEL_RULE 1012 /* Delete syscall filtering rule */ #define AUDIT_LIST_RULES 1013 /* List syscall filtering rules */ -#define AUDIT_TTY_GET 1014 /* Get TTY auditing status */ -#define AUDIT_TTY_SET 1015 /* Set TTY auditing status */ +#define AUDIT_TTY_GET 1016 /* Get TTY auditing status */ +#define AUDIT_TTY_SET 1017 /* Set TTY auditing status */ #define AUDIT_FIRST_USER_MSG 1100 /* Userspace messages mostly uninteresting to kernel */ #define AUDIT_USER_AVC 1107 /* We filter this differently */ -- cgit From e120e8d03a263cf75f2abc0f8b3a03a65cfd3b88 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Sat, 25 Aug 2007 05:42:01 +1000 Subject: [POWERPC] Fix undefined reference to device_power_up/resume Current Linus tree fails to link on pmac32: drivers/built-in.o: In function `pmac_wakeup_devices': via-pmu.c:(.text+0x5bab4): undefined reference to `device_power_up' via-pmu.c:(.text+0x5bb08): undefined reference to `device_resume' drivers/built-in.o: In function `pmac_suspend_devices': via-pmu.c:(.text+0x5c260): undefined reference to `device_power_down' via-pmu.c:(.text+0x5c27c): undefined reference to `device_resume' make[1]: *** [.tmp_vmlinux1] Error 1 changing CONFIG_PM > CONFIG_PM_SLEEP leads to: drivers/built-in.o: In function `pmu_led_set': via-pmu-led.c:(.text+0x5cdca): undefined reference to `pmu_sys_suspended' via-pmu-led.c:(.text+0x5cdce): undefined reference to `pmu_sys_suspended' drivers/built-in.o: In function `pmu_req_done': via-pmu-led.c:(.text+0x5ce3e): undefined reference to `pmu_sys_suspended' via-pmu-led.c:(.text+0x5ce42): undefined reference to `pmu_sys_suspended' drivers/built-in.o: In function `adb_init': (.init.text+0x4c5c): undefined reference to `pmu_register_sleep_notifier' make[1]: *** [.tmp_vmlinux1] Error 1 So change even more places from PM to PM_SLEEP to allow linking. 
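The conversion is the same guard pattern throughout; a minimal sketch, mirroring the include/linux/pmu.h hunk below (the #else body here is illustrative, it is not part of this diff):

#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32)
extern int pmu_sys_suspended;
#else
/* if power management is not configured it can't be suspended */
#define pmu_sys_suspended	0
#endif
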
Signed-off-by: Olaf Hering Signed-off-by: Paul Mackerras --- drivers/macintosh/adb.c | 4 ++-- drivers/macintosh/via-pmu.c | 34 +++++++++++++++++----------------- include/linux/pmu.h | 2 +- 3 files changed, 20 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index bc77c5e2ca9f..5c742a526082 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -89,7 +89,7 @@ static int sleepy_trackpad; static int autopoll_devs; int __adb_probe_sync; -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static void adb_notify_sleep(struct pmu_sleep_notifier *self, int when); static struct pmu_sleep_notifier adb_sleep_notifier = { adb_notify_sleep, @@ -313,7 +313,7 @@ int __init adb_init(void) printk(KERN_WARNING "Warning: no ADB interface detected\n"); adb_controller = NULL; } else { -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP pmu_register_sleep_notifier(&adb_sleep_notifier); #endif /* CONFIG_PM */ #ifdef CONFIG_PPC diff --git a/drivers/macintosh/via-pmu.c b/drivers/macintosh/via-pmu.c index 157080b3b468..04f3973e3874 100644 --- a/drivers/macintosh/via-pmu.c +++ b/drivers/macintosh/via-pmu.c @@ -152,10 +152,10 @@ static spinlock_t pmu_lock; static u8 pmu_intr_mask; static int pmu_version; static int drop_interrupts; -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) static int option_lid_wakeup = 1; -#endif /* CONFIG_PM && CONFIG_PPC32 */ -#if (defined(CONFIG_PM)&&defined(CONFIG_PPC32))||defined(CONFIG_PMAC_BACKLIGHT_LEGACY) +#endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ +#if (defined(CONFIG_PM_SLEEP)&&defined(CONFIG_PPC32))||defined(CONFIG_PMAC_BACKLIGHT_LEGACY) static int sleep_in_progress; #endif static unsigned long async_req_locks; @@ -875,7 +875,7 @@ proc_read_options(char *page, char **start, off_t off, { char *p = page; -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) if (pmu_kind == PMU_KEYLARGO_BASED && pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0) p += sprintf(p, "lid_wakeup=%d\n", option_lid_wakeup); @@ -916,7 +916,7 @@ proc_write_options(struct file *file, const char __user *buffer, *(val++) = 0; while(*val == ' ') val++; -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) if (pmu_kind == PMU_KEYLARGO_BASED && pmac_call_feature(PMAC_FTR_SLEEP_STATE,NULL,0,-1) >= 0) if (!strcmp(label, "lid_wakeup")) @@ -1738,7 +1738,7 @@ pmu_present(void) return via != 0; } -#ifdef CONFIG_PM +#ifdef CONFIG_PM_SLEEP static LIST_HEAD(sleep_notifiers); @@ -1769,9 +1769,9 @@ pmu_unregister_sleep_notifier(struct pmu_sleep_notifier* n) return 0; } EXPORT_SYMBOL(pmu_unregister_sleep_notifier); -#endif /* CONFIG_PM */ +#endif /* CONFIG_PM_SLEEP */ -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) /* Sleep is broadcast last-to-first */ static void broadcast_sleep(int when) @@ -2390,7 +2390,7 @@ powerbook_sleep_3400(void) return 0; } -#endif /* CONFIG_PM && CONFIG_PPC32 */ +#endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ /* * Support for /dev/pmu device @@ -2573,7 +2573,7 @@ pmu_ioctl(struct inode * inode, struct file *filp, int error = -EINVAL; switch (cmd) { -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) case PMU_IOC_SLEEP: if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -2601,7 +2601,7 @@ pmu_ioctl(struct inode * inode, struct file *filp, return put_user(0, argp); else 
return put_user(1, argp); -#endif /* CONFIG_PM && CONFIG_PPC32 */ +#endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ #ifdef CONFIG_PMAC_BACKLIGHT_LEGACY /* Compatibility ioctl's for backlight */ @@ -2757,7 +2757,7 @@ pmu_polled_request(struct adb_request *req) * to do suspend-to-disk. */ -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) int pmu_sys_suspended; @@ -2792,7 +2792,7 @@ static int pmu_sys_resume(struct sys_device *sysdev) return 0; } -#endif /* CONFIG_PM && CONFIG_PPC32 */ +#endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ static struct sysdev_class pmu_sysclass = { set_kset_name("pmu"), @@ -2803,10 +2803,10 @@ static struct sys_device device_pmu = { }; static struct sysdev_driver driver_pmu = { -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) .suspend = &pmu_sys_suspend, .resume = &pmu_sys_resume, -#endif /* CONFIG_PM && CONFIG_PPC32 */ +#endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ }; static int __init init_pmu_sysfs(void) @@ -2841,10 +2841,10 @@ EXPORT_SYMBOL(pmu_wait_complete); EXPORT_SYMBOL(pmu_suspend); EXPORT_SYMBOL(pmu_resume); EXPORT_SYMBOL(pmu_unlock); -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) EXPORT_SYMBOL(pmu_enable_irled); EXPORT_SYMBOL(pmu_battery_count); EXPORT_SYMBOL(pmu_batteries); EXPORT_SYMBOL(pmu_power_flags); -#endif /* CONFIG_PM && CONFIG_PPC32 */ +#endif /* CONFIG_PM_SLEEP && CONFIG_PPC32 */ diff --git a/include/linux/pmu.h b/include/linux/pmu.h index 5ad913ff02b2..b7824c215354 100644 --- a/include/linux/pmu.h +++ b/include/linux/pmu.h @@ -226,7 +226,7 @@ extern unsigned int pmu_power_flags; extern void pmu_backlight_init(void); /* some code needs to know if the PMU was suspended for hibernation */ -#if defined(CONFIG_PM) && defined(CONFIG_PPC32) +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PPC32) extern int pmu_sys_suspended; #else /* if power management is not configured it can't be suspended */ -- cgit From 32ddef98f232585f20bc8bdb891029a6a5f633d0 Mon Sep 17 00:00:00 2001 From: Xavier Bachelot Date: Sat, 25 Aug 2007 18:10:52 +1000 Subject: agp: Add device id for P4M900 to via-agp module Signed-off-by: Dave Airlie --- drivers/char/agp/via-agp.c | 5 +++++ include/linux/pci_ids.h | 1 + 2 files changed, 6 insertions(+) (limited to 'include/linux') diff --git a/drivers/char/agp/via-agp.c b/drivers/char/agp/via-agp.c index 9aaf401a8975..0ecc54d327bc 100644 --- a/drivers/char/agp/via-agp.c +++ b/drivers/char/agp/via-agp.c @@ -399,6 +399,11 @@ static struct agp_device_ids via_agp_device_ids[] __devinitdata = .device_id = PCI_DEVICE_ID_VIA_P4M890, .chipset_name = "P4M890", }, + /* P4M900 */ + { + .device_id = PCI_DEVICE_ID_VIA_VT3364, + .chipset_name = "P4M900", + }, { }, /* dummy final entry, always present */ }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 07fc57429b58..0d58a39017de 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1287,6 +1287,7 @@ #define PCI_DEVICE_ID_VIA_VT3324 0x0324 #define PCI_DEVICE_ID_VIA_VT3336 0x0336 #define PCI_DEVICE_ID_VIA_VT3351 0x0351 +#define PCI_DEVICE_ID_VIA_VT3364 0x0364 #define PCI_DEVICE_ID_VIA_8371_0 0x0391 #define PCI_DEVICE_ID_VIA_8501_0 0x0501 #define PCI_DEVICE_ID_VIA_82C561 0x0561 -- cgit From 218050855ece4e923106ab614ac65afa0f618df3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: adaptive scheduler granularity Instead of specifying the preemption granularity, 
specify the wanted latency. By fixing the granularity to a constant, the wakeup latency is a function of the number of running tasks on the rq. Invert this relation. sysctl_sched_granularity becomes a minimum for the dynamic granularity computed from the new sysctl_sched_latency. Then use this latency to do more intelligent granularity decisions: if there are fewer tasks running then we can schedule coarser. This helps performance while still always keeping the latency target. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 14 ++++++---- kernel/sched_fair.c | 77 +++++++++++++++++++++++++++++++++++++++++++-------- kernel/sysctl.c | 11 ++++++++ 4 files changed, 86 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index ba78807eab91..322764e04052 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1399,6 +1399,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); +extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index 6798328a2e0e..da26f46d50d7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4911,14 +4911,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + if (sysctl_sched_granularity > limit) + sysctl_sched_granularity = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 5; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency * 5; + sysctl_sched_wakeup_granularity = sysctl_sched_latency / 2; } #ifdef CONFIG_SMP diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4d6b7e2df2aa..0ba1e60f08d0 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -15,23 +15,32 @@ * * Scaled math optimizations by Thomas Gleixner * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ /* - * Preemption granularity: - * (default: 10 msec, units: nanoseconds) + * Targeted preemption latency for CPU-bound tasks: + * (default: 20ms, units: nanoseconds) * - * NOTE: this granularity value is not the same as the concept of - * 'timeslice length' - timeslices in CFS will typically be somewhat - * larger than this value. (to see the precise effective timeslice - * length of your workload, run vmstat and monitor the context-switches - * field) + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length. + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches field) * * On SMP systems the value of this is multiplied by the log2 of the * number of CPUs. (i.e. factor 2x on 2-way systems, 3x on 4-way * systems, 4x on 8-way systems, 5x on 16-way systems, etc.) 
+ * Targeted preemption latency for CPU-bound tasks: */ -unsigned int sysctl_sched_granularity __read_mostly = 10000000UL; +unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 2 msec, units: nanoseconds) + */ +unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. @@ -212,6 +221,49 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ +/* + * Calculate the preemption granularity needed to schedule every + * runnable task once per sysctl_sched_latency amount of time. + * (down to a sensible low limit on granularity) + * + * For example, if there are 2 tasks running and latency is 10 msecs, + * we switch tasks every 5 msecs. If we have 3 tasks running, we have + * to switch tasks every 3.33 msecs to get a 10 msecs observed latency + * for each task. We do finer and finer scheduling up to until we + * reach the minimum granularity value. + * + * To achieve this we use the following dynamic-granularity rule: + * + * gran = lat/nr - lat/nr/nr + * + * This comes out of the following equations: + * + * kA1 + gran = kB1 + * kB2 + gran = kA2 + * kA2 = kA1 + * kB2 = kB1 - d + d/nr + * lat = d * nr + * + * Where 'k' is key, 'A' is task A (waiting), 'B' is task B (running), + * '1' is start of time, '2' is end of time, 'd' is delay between + * 1 and 2 (during which task B was running), 'nr' is number of tasks + * running, 'lat' is the the period of each task. ('lat' is the + * sched_latency that we aim for.) + */ +static long +sched_granularity(struct cfs_rq *cfs_rq) +{ + unsigned int gran = sysctl_sched_latency; + unsigned int nr = cfs_rq->nr_running; + + if (nr > 1) { + gran = gran/nr - gran/nr/nr; + gran = max(gran, sysctl_sched_granularity); + } + + return gran; +} + /* * We rescale the rescheduling granularity of tasks according to their * nice level, but only linearly, not exponentially: @@ -302,7 +354,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr) delta_fair = calc_delta_fair(delta_exec, lw); delta_mine = calc_delta_mine(delta_exec, curr->load.weight, lw); - if (cfs_rq->sleeper_bonus > sysctl_sched_granularity) { + if (cfs_rq->sleeper_bonus > sysctl_sched_latency) { delta = min((u64)delta_mine, cfs_rq->sleeper_bonus); delta = min(delta, (unsigned long)( (long)sysctl_sched_runtime_limit - curr->wait_runtime)); @@ -689,7 +741,8 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, sysctl_sched_granularity); + __check_preempt_curr_fair(cfs_rq, next, curr, + sched_granularity(cfs_rq)); } /************************************************** @@ -1034,7 +1087,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * it will preempt the parent: */ p->se.fair_key = current->se.fair_key - - niced_granularity(&rq->curr->se, sysctl_sched_granularity) - 1; + niced_granularity(&rq->curr->se, sched_granularity(cfs_rq)) - 1; /* * The first wait is dominated by the child-runs-first logic, * so do not credit it with that waiting time yet: @@ -1047,7 +1100,7 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) * -granularity/2, so initialize the task with that: */ if (sysctl_sched_features & SCHED_FEAT_START_DEBIT) - p->se.wait_runtime = -((long)sysctl_sched_granularity / 2); + p->se.wait_runtime = -(sched_granularity(cfs_rq) / 2); __enqueue_entity(cfs_rq, se); } diff --git 
a/kernel/sysctl.c b/kernel/sysctl.c index ea90ef51085c..9e3d2960faf5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -231,6 +231,17 @@ static ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_wakeup_granularity_ns", -- cgit From 172ac3dbb7d3e528ac53d08a34df88d1ac53c534 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 25 Aug 2007 18:41:53 +0200 Subject: sched: cleanup, sched_granularity -> sched_min_granularity due to adaptive granularity scheduling the role of sched_granularity has changed to "minimum granularity", so rename the variable (and the tunable) accordingly. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 2 +- kernel/sched.c | 6 +++--- kernel/sched_fair.c | 4 ++-- kernel/sysctl.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 322764e04052..bd6a0320a770 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1400,7 +1400,7 @@ static inline void idle_task_exit(void) {} extern void sched_idle_next(void); extern unsigned int sysctl_sched_latency; -extern unsigned int sysctl_sched_granularity; +extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; diff --git a/kernel/sched.c b/kernel/sched.c index da26f46d50d7..a40ab657ad19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4913,9 +4913,9 @@ static inline void sched_init_granularity(void) unsigned int factor = 1 + ilog2(num_online_cpus()); const unsigned long limit = 100000000; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > limit) - sysctl_sched_granularity = limit; + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; sysctl_sched_latency *= factor; if (sysctl_sched_latency > limit) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 0ba1e60f08d0..ee3771850aaf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -40,7 +40,7 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; * Minimal preemption granularity for CPU-bound tasks: * (default: 2 msec, units: nanoseconds) */ -unsigned int sysctl_sched_granularity __read_mostly = 2000000ULL; +unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; /* * SCHED_BATCH wake-up granularity. 
@@ -258,7 +258,7 @@ sched_granularity(struct cfs_rq *cfs_rq) if (nr > 1) { gran = gran/nr - gran/nr/nr; - gran = max(gran, sysctl_sched_granularity); + gran = max(gran, sysctl_sched_min_granularity); } return gran; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9e3d2960faf5..6ace893c17c9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -222,8 +222,8 @@ static ctl_table kern_table[] = { #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, - .procname = "sched_granularity_ns", - .data = &sysctl_sched_granularity, + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, -- cgit From 36d98d3edce12c8f9ffd33f8f5d23ce728380925 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 22 Aug 2007 12:36:01 -0700 Subject: [KBUILD]: Sanitize tc_ematch headers. The headers in tc_ematch are used by iproute2, so these headers should be processed. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index ad7f71a81b0a..818cc3a50e6b 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -7,6 +7,7 @@ header-y += raid/ header-y += spi/ header-y += sunrpc/ header-y += tc_act/ +header-y += tc_ematch/ header-y += netfilter/ header-y += netfilter_arp/ header-y += netfilter_bridge/ -- cgit From f424bb9efaee90b752aabcb4e5e95920ee9580bb Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 24 Aug 2007 23:04:18 -0700 Subject: [PPPOL2TP]: Fix endianness annotations. {s,d}_{session,tunnel} in pppol2tp_addr are actually host-endian everywhere. We might switch them to net-endian, of course, but that structure is exposed to userland via getname... Signed-off-by: Al Viro Signed-off-by: David S. Miller --- include/linux/if_pppol2tp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/if_pppol2tp.h b/include/linux/if_pppol2tp.h index 516203b6fdeb..a7d6a2234b31 100644 --- a/include/linux/if_pppol2tp.h +++ b/include/linux/if_pppol2tp.h @@ -32,8 +32,8 @@ struct pppol2tp_addr struct sockaddr_in addr; /* IP address and port to send to */ - __be16 s_tunnel, s_session; /* For matching incoming packets */ - __be16 d_tunnel, d_session; /* For sending outgoing packets */ + __u16 s_tunnel, s_session; /* For matching incoming packets */ + __u16 d_tunnel, d_session; /* For sending outgoing packets */ }; /* Socket options: -- cgit From f6cf891c4d7128f9f91243fc0b9ce99e10fa1586 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 28 Aug 2007 12:53:24 +0200 Subject: sched: make the scheduler converge to the ideal latency de-HZ-ification of the granularity defaults unearthed a pre-existing property of CFS: while it correctly converges to the granularity goal, it does not prevent run-time fluctuations in the range of [-gran ... 0 ... +gran]. With the increase of the granularity due to the removal of HZ dependencies, this becomes visible in chew-max output (with 5 tasks running): out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 17 . 13 | per: 44 . 40 out: 27 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 36 . 40 out: 29 . 27. 32 | flu: 2 . 0 | ran: 17 . 13 | per: 46 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 40 out: 29 . 27. 32 | flu: 0 . 0 | ran: 18 . 13 | per: 47 . 40 out: 28 . 27. 32 | flu: 0 . 0 | ran: 9 . 13 | per: 37 . 
40 average slice is the ideal 13 msecs and the period is picture-perfect 40 msecs. But the 'ran' field fluctuates around 13.33 msecs and there's no mechanism in CFS to keep that from happening: it's a perfectly valid solution that CFS finds. to fix this we add a granularity/preemption rule that knows about the "target latency", which makes tasks that run longer than the ideal latency run a bit less. The simplest approach is to simply decrease the preemption granularity when a task overruns its ideal latency. For this we have to track how much the task executed since its last preemption. ( this adds a new field to task_struct, but we can eliminate that overhead in 2.6.24 by putting all the scheduler timestamps into an anonymous union. ) with this change in place, chew-max output is fluctuation-less all around: out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 2 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 out: 28 . 27. 39 | flu: 0 . 1 | ran: 13 . 13 | per: 41 . 40 this patch has no impact on any fastpath or on any globally observable scheduling property. (unless you have sharp enough eyes to see millisecond-level ruckles in glxgears smoothness :-) Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra Signed-off-by: Mike Galbraith --- include/linux/sched.h | 1 + kernel/sched.c | 1 + kernel/sched_fair.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index bd6a0320a770..f4e324ed2e44 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -904,6 +904,7 @@ struct sched_entity { u64 exec_start; u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; u64 wait_start_fair; u64 sleep_start_fair; diff --git a/kernel/sched.c b/kernel/sched.c index 9fe473a190de..b533d6db78aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1587,6 +1587,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_start_fair = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9f53d49f3aab..721fe7744874 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -668,7 +668,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) /* * Preempt the current task with a newly woken task if needed: */ -static void +static int __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, struct sched_entity *curr, unsigned long granularity) { @@ -679,8 +679,11 @@ __check_preempt_curr_fair(struct cfs_rq *cfs_rq, struct sched_entity *se, * preempt the current task unless the best task has * a larger than sched_granularity fairness advantage: */ - if (__delta > niced_granularity(curr, granularity)) + if (__delta > niced_granularity(curr, granularity)) { resched_task(rq_of(cfs_rq)->curr); + return 1; + } + return 0; } static inline void @@ -725,6 +728,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) { + unsigned long gran, ideal_runtime, delta_exec; struct sched_entity *next; /* @@ -741,8 +745,22 @@ static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) if (next == 
curr) return; - __check_preempt_curr_fair(cfs_rq, next, curr, - sched_granularity(cfs_rq)); + gran = sched_granularity(cfs_rq); + ideal_runtime = niced_granularity(curr, + max(sysctl_sched_latency / cfs_rq->nr_running, + (unsigned long)sysctl_sched_min_granularity)); + /* + * If we executed more than what the latency constraint suggests, + * reduce the rescheduling granularity. This way the total latency + * of how much a task is not scheduled converges to + * sysctl_sched_latency: + */ + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) + gran = 0; + + if (__check_preempt_curr_fair(cfs_rq, next, curr, gran)) + curr->prev_sum_exec_runtime = curr->sum_exec_runtime; } /************************************************** -- cgit From 05bb1fad1cde025a864a90cfeb98dcbefe78a44a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 30 Aug 2007 22:10:28 -0700 Subject: [TCP]: Allow minimum RTO to be configurable via routing metrics. Cell phone networks do link layer retransmissions and other things that cause unnecessary timeout retransmits. So allow the minimum RTO to be inflated per-route to deal with this. Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 2 ++ net/ipv4/tcp_input.c | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index c91476ce314a..dff3192374f8 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -351,6 +351,8 @@ enum #define RTAX_INITCWND RTAX_INITCWND RTAX_FEATURES, #define RTAX_FEATURES RTAX_FEATURES + RTAX_RTO_MIN, +#define RTAX_RTO_MIN RTAX_RTO_MIN __RTAX_MAX }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9785df37a65f..1ee72127462b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -555,6 +555,16 @@ static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) tcp_grow_window(sk, skb); } +static u32 tcp_rto_min(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + u32 rto_min = TCP_RTO_MIN; + + if (dst_metric_locked(dst, RTAX_RTO_MIN)) + rto_min = dst->metrics[RTAX_RTO_MIN-1]; + return rto_min; +} + /* Called to compute a smoothed rtt estimate. The data fed to this * routine either comes from timestamps, or from segments that were * known _not_ to have been retransmitted [see Karn/Partridge @@ -616,13 +626,13 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (tp->mdev_max < tp->rttvar) tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = TCP_RTO_MIN; + tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. 
*/ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } } -- cgit From 91a6d4ed333a47003f07636b3bd143521e0bbad1 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 27 Aug 2007 19:35:22 +0200 Subject: ata: add ATA_MWDMA* and ATA_SWDMA* defines Cc: Jeff Garzik Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jeff Garzik --- include/linux/ata.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ata.h b/include/linux/ata.h index 23a22df039d8..c043c1ccf1c5 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -73,6 +73,19 @@ enum { ATA_PIO5 = ATA_PIO4 | (1 << 5), ATA_PIO6 = ATA_PIO5 | (1 << 6), + ATA_SWDMA0 = (1 << 0), + ATA_SWDMA1 = ATA_SWDMA0 | (1 << 1), + ATA_SWDMA2 = ATA_SWDMA1 | (1 << 2), + + ATA_SWDMA2_ONLY = (1 << 2), + + ATA_MWDMA0 = (1 << 0), + ATA_MWDMA1 = ATA_MWDMA0 | (1 << 1), + ATA_MWDMA2 = ATA_MWDMA1 | (1 << 2), + + ATA_MWDMA12_ONLY = (1 << 1) | (1 << 2), + ATA_MWDMA2_ONLY = (1 << 2), + ATA_UDMA0 = (1 << 0), ATA_UDMA1 = ATA_UDMA0 | (1 << 1), ATA_UDMA2 = ATA_UDMA1 | (1 << 2), -- cgit From aa137f9d29d30592774c727ec5cfcf9891e576fa Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Fri, 31 Aug 2007 00:48:45 -0700 Subject: SLUB: Force inlining for functions in slub_def.h Some compilers (especially older gcc releases) may skip inlining sometimes which will lead to link failures. Force the inlining of keyfunctions in slub_def.h to avoid these issues. Signed-off-by: Christoph Lameter Acked-by: Jan Dittmer Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 124270df8734..74962077f632 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -78,7 +78,7 @@ extern struct kmem_cache kmalloc_caches[KMALLOC_SHIFT_HIGH + 1]; * Sorry that the following has to be that ugly but some versions of GCC * have trouble with constant propagation and loops. */ -static inline int kmalloc_index(size_t size) +static __always_inline int kmalloc_index(size_t size) { if (!size) return 0; @@ -133,7 +133,7 @@ static inline int kmalloc_index(size_t size) * This ought to end up with a global pointer to the right cache * in kmalloc_caches. 
*/ -static inline struct kmem_cache *kmalloc_slab(size_t size) +static __always_inline struct kmem_cache *kmalloc_slab(size_t size) { int index = kmalloc_index(size); @@ -166,7 +166,7 @@ static inline struct kmem_cache *kmalloc_slab(size_t size) void *kmem_cache_alloc(struct kmem_cache *, gfp_t); void *__kmalloc(size_t size, gfp_t flags); -static inline void *kmalloc(size_t size, gfp_t flags) +static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size) && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); @@ -183,7 +183,7 @@ static inline void *kmalloc(size_t size, gfp_t flags) void *__kmalloc_node(size_t size, gfp_t flags, int node); void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node); -static inline void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && !(flags & SLUB_DMA)) { struct kmem_cache *s = kmalloc_slab(size); -- cgit From 16c55b038033d8f6f7601996dfae44399666d9ab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 Aug 2007 11:58:33 +0900 Subject: libata: implement BROKEN_HPA horkage and apply it to affected drives Some drives choke on READ_NATIVE_MAX_ADDRESS[_EXT]. Implement ATA_HORKAGE_BROKEN_HPA and apply it to affected drives. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 11 ++++++++--- include/linux/libata.h | 1 + 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 9cf46bf8c8d2..a3ee087223de 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1911,8 +1911,9 @@ int ata_dev_configure(struct ata_device *dev) dev->flags |= ATA_DFLAG_FLUSH_EXT; } - if (ata_id_hpa_enabled(dev->id)) - dev->n_sectors = ata_hpa_resize(dev); + if (!(dev->horkage & ATA_HORKAGE_BROKEN_HPA) && + ata_id_hpa_enabled(dev->id)) + dev->n_sectors = ata_hpa_resize(dev); /* config NCQ */ ata_dev_config_ncq(dev, ncq_desc, sizeof(ncq_desc)); @@ -3795,7 +3796,11 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { { "ST9160821AS", "3.CLF", ATA_HORKAGE_NONCQ, }, { "SAMSUNG HD401LJ", "ZZ100-15", ATA_HORKAGE_NONCQ, }, - /* Devices with NCQ limits */ + /* devices which puke on READ_NATIVE_MAX */ + { "HDS724040KLSA80", "KFAOA20N", ATA_HORKAGE_BROKEN_HPA, }, + { "WDC WD3200JD-00KLB0", "WD-WCAMR1130137", ATA_HORKAGE_BROKEN_HPA }, + { "WDC WD2500JD-00HBB0", "WD-WMAL71490727", ATA_HORKAGE_BROKEN_HPA }, + { "MAXTOR 6L080L4", "A93.0500", ATA_HORKAGE_BROKEN_HPA }, /* End Marker */ { } diff --git a/include/linux/libata.h b/include/linux/libata.h index 41978a557318..a67bb9075e9b 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -303,6 +303,7 @@ enum { ATA_HORKAGE_NODMA = (1 << 1), /* DMA problems */ ATA_HORKAGE_NONCQ = (1 << 2), /* Don't use NCQ */ ATA_HORKAGE_MAX_SEC_128 = (1 << 3), /* Limit max sects to 128 */ + ATA_HORKAGE_BROKEN_HPA = (1 << 4), /* Broken HPA */ }; enum hsm_task_states { -- cgit From f3de4be9d5f8551d7880a1f1f5231a30e0161b1f Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Thu, 30 Aug 2007 23:56:29 -0700 Subject: PM: Fix dependencies of CONFIG_SUSPEND and CONFIG_HIBERNATION Dependencies of CONFIG_SUSPEND and CONFIG_HIBERNATION introduced by commit 296699de6bdc717189a331ab6bbe90e05c94db06 "Introduce CONFIG_SUSPEND for suspend-to-Ram and standby" are incorrect, as they don't cover the facts that (1) not all architectures support suspend and (2) SMP hibernation is only possible on X86 and PPC64 (if CONFIG_PPC64_SWSUSP is set). Signed-off-by: Rafael J. Wysocki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86_64/defconfig | 1 - include/linux/cpu.h | 6 +++--- kernel/cpu.c | 4 ++-- kernel/power/Kconfig | 41 +++++++++++++++++++++++++++++++---------- 4 files changed, 36 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index e64f65c9d901..b091c5e35558 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -201,7 +201,6 @@ CONFIG_PM=y # CONFIG_PM_DEBUG is not set CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" -CONFIG_SUSPEND_SMP=y # # ACPI (Advanced Configuration and Power Interface) Support diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 1d5ded0836ee..0ad72c4cf312 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -126,16 +126,16 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex) static inline int cpu_is_offline(int cpu) { return 0; } #endif /* CONFIG_HOTPLUG_CPU */ -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP extern int suspend_cpu_hotplug; extern int disable_nonboot_cpus(void); extern void enable_nonboot_cpus(void); -#else +#else /* !CONFIG_PM_SLEEP_SMP */ #define suspend_cpu_hotplug 0 static inline int disable_nonboot_cpus(void) { return 0; } static inline void enable_nonboot_cpus(void) {} -#endif +#endif /* !CONFIG_PM_SLEEP_SMP */ #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index 181ae7086029..38033db8d8ec 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -273,7 +273,7 @@ int __cpuinit cpu_up(unsigned int cpu) return err; } -#ifdef CONFIG_SUSPEND_SMP +#ifdef CONFIG_PM_SLEEP_SMP static cpumask_t frozen_cpus; int disable_nonboot_cpus(void) @@ -334,4 +334,4 @@ void enable_nonboot_cpus(void) out: mutex_unlock(&cpu_add_remove_lock); } -#endif +#endif /* CONFIG_PM_SLEEP_SMP */ diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 412859f8d94a..c8580a1e6873 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -72,15 +72,10 @@ config PM_TRACE CAUTION: this option will cause your machine's real-time clock to be set to an invalid time after a resume. 
-config SUSPEND_SMP_POSSIBLE - bool - depends on (X86 && !X86_VOYAGER) || (PPC64 && (PPC_PSERIES || PPC_PMAC)) - depends on SMP - default y - -config SUSPEND_SMP +config PM_SLEEP_SMP bool - depends on SUSPEND_SMP_POSSIBLE && PM_SLEEP + depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE + depends on PM_SLEEP select HOTPLUG_CPU default y @@ -89,20 +84,46 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION default y +config SUSPEND_UP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ + || SUPERH || FRV + depends on !SMP + default y + +config SUSPEND_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) \ + || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM + depends on SMP + default y + config SUSPEND bool "Suspend to RAM and standby" depends on PM - depends on !SMP || SUSPEND_SMP_POSSIBLE + depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE default y ---help--- Allow the system to enter sleep states in which main memory is powered and thus its contents are preserved, such as the suspend-to-RAM state (i.e. the ACPI S3 state). +config HIBERNATION_UP_POSSIBLE + bool + depends on X86 || PPC64_SWSUSP || FRV || PPC32 + depends on !SMP + default y + +config HIBERNATION_SMP_POSSIBLE + bool + depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP + depends on SMP + default y + config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP - depends on ((X86 || PPC64_SWSUSP || FRV || PPC32) && !SMP) || SUSPEND_SMP_POSSIBLE + depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the -- cgit From 60693e5a9a2063b87d4dbe8029816c814b3fa84e Mon Sep 17 00:00:00 2001 From: Shane Huang Date: Thu, 30 Aug 2007 23:56:38 -0700 Subject: i2c-piix4: Fix SB700 PCI device ID We find that SB700 and SB800 use the same SMBus device ID as SB600, which is 0x4385, instead of the already submitted 0x4395. Besides removing the wrong SB700 device ID, add SB800 support to kernel, by renaming the PCI_DEVICE_ID_ATI_IXP600_SMBUS into PCI_DEVICE_ID_ATI_SBX00_SMBUS. 
Signed-off-by: Shane Huang Signed-off-by: Jean Delvare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/i2c/busses/i2c-piix4 | 2 +- drivers/i2c/busses/Kconfig | 1 + drivers/i2c/busses/i2c-piix4.c | 6 ++---- include/linux/pci_ids.h | 3 +-- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/i2c/busses/i2c-piix4 b/Documentation/i2c/busses/i2c-piix4 index fa0c786a8bf5..cf6b6cb02aa1 100644 --- a/Documentation/i2c/busses/i2c-piix4 +++ b/Documentation/i2c/busses/i2c-piix4 @@ -6,7 +6,7 @@ Supported adapters: Datasheet: Publicly available at the Intel website * ServerWorks OSB4, CSB5, CSB6 and HT-1000 southbridges Datasheet: Only available via NDA from ServerWorks - * ATI IXP200, IXP300, IXP400, SB600 and SB700 southbridges + * ATI IXP200, IXP300, IXP400, SB600, SB700 and SB800 southbridges Datasheet: Not publicly available * Standard Microsystems (SMSC) SLC90E66 (Victory66) southbridge Datasheet: Publicly available at the SMSC website http://www.smsc.com diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 1842f523c23d..9f3a4cd0b07f 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -208,6 +208,7 @@ config I2C_PIIX4 ATI IXP400 ATI SB600 ATI SB700 + ATI SB800 Serverworks OSB4 Serverworks CSB5 Serverworks CSB6 diff --git a/drivers/i2c/busses/i2c-piix4.c b/drivers/i2c/busses/i2c-piix4.c index debc76cd2161..167e4137ee21 100644 --- a/drivers/i2c/busses/i2c-piix4.c +++ b/drivers/i2c/busses/i2c-piix4.c @@ -23,7 +23,7 @@ Supports: Intel PIIX4, 440MX Serverworks OSB4, CSB5, CSB6, HT-1000 - ATI IXP200, IXP300, IXP400, SB600, SB700 + ATI IXP200, IXP300, IXP400, SB600, SB700, SB800 SMSC Victory66 Note: we assume there can only be one device, with one SMBus interface. @@ -397,9 +397,7 @@ static struct pci_device_id piix4_ids[] = { .driver_data = 0 }, { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS), .driver_data = 0 }, - { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP600_SMBUS), - .driver_data = 0 }, - { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP700_SMBUS), + { PCI_DEVICE(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS), .driver_data = 0 }, { PCI_DEVICE(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_OSB4), .driver_data = 0 }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 06d23e10a16d..17168f3cc73f 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -374,10 +374,9 @@ #define PCI_DEVICE_ID_ATI_IXP400_SATA 0x4379 #define PCI_DEVICE_ID_ATI_IXP400_SATA2 0x437a #define PCI_DEVICE_ID_ATI_IXP600_SATA 0x4380 -#define PCI_DEVICE_ID_ATI_IXP600_SMBUS 0x4385 +#define PCI_DEVICE_ID_ATI_SBX00_SMBUS 0x4385 #define PCI_DEVICE_ID_ATI_IXP600_IDE 0x438c #define PCI_DEVICE_ID_ATI_IXP700_SATA 0x4390 -#define PCI_DEVICE_ID_ATI_IXP700_SMBUS 0x4395 #define PCI_DEVICE_ID_ATI_IXP700_IDE 0x439c #define PCI_VENDOR_ID_VLSI 0x1004 -- cgit From dec4ad86c2fbea062e9ef9caa6d6e79f7c5e0b12 Mon Sep 17 00:00:00 2001 From: David Gibson Date: Thu, 30 Aug 2007 23:56:40 -0700 Subject: hugepage: fix broken check for offset alignment in hugepage mappings For hugepage mappings, the file offset, like the address and size, needs to be aligned to the size of a hugepage. In commit 68589bc353037f233fe510ad9ff432338c95db66, the check for this was moved into prepare_hugepage_range() along with the address and size checks. 
But since BenH's rework of the get_unmapped_area() paths leading up to commit 4b1d89290b62bb2db476c94c82cf7442aab440c8, prepare_hugepage_range() is only called for MAP_FIXED mappings, not for other mappings. This means we're no longer ever checking for an aligned offset - I've confirmed that mmap() will (apparently) succeed with a misaligned offset on both powerpc and i386 at least. This patch restores the check, removing it from prepare_hugepage_range() and putting it back into hugetlbfs_file_mmap(). I'm putting it there, rather than in the get_unmapped_area() path, so it only needs to go in one place, rather than separately in the half-dozen or so arch-specific implementations of hugetlb_get_unmapped_area(). Signed-off-by: David Gibson Cc: Adam Litke Cc: Andi Kleen Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/i386/mm/hugetlbpage.c | 2 +- arch/ia64/mm/hugetlbpage.c | 6 ++---- arch/sparc64/mm/hugetlbpage.c | 2 +- fs/hugetlbfs/inode.c | 15 ++++++++++----- include/linux/hugetlb.h | 10 +++------- 5 files changed, 17 insertions(+), 18 deletions(-) (limited to 'include/linux') diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index efdf95ac8031..6c06d9c0488e 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -367,7 +367,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len, pgoff)) + if (prepare_hugepage_range(addr, len)) return -EINVAL; return addr; } diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index d22861c5b04c..a9ff685aea25 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -75,10 +75,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) * Don't actually need to do any preparation, but need to make sure * the address is in the right region. */ -int prepare_hugepage_range(unsigned long addr, unsigned long len, pgoff_t pgoff) +int prepare_hugepage_range(unsigned long addr, unsigned long len) { - if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT)) - return -EINVAL; if (len & ~HPAGE_MASK) return -EINVAL; if (addr & ~HPAGE_MASK) @@ -151,7 +149,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, u /* Handle MAP_FIXED */ if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len, pgoff)) + if (prepare_hugepage_range(addr, len)) return -EINVAL; return addr; } diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index eaba9b70b184..6cfab2e4d340 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -175,7 +175,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len, pgoff)) + if (prepare_hugepage_range(addr, len)) return -EINVAL; return addr; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c848a191525d..950c2fbb815b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -82,14 +82,19 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) int ret; /* - * vma alignment has already been checked by prepare_hugepage_range. - * If you add any error returns here, do so after setting VM_HUGETLB, - * so is_vm_hugetlb_page tests below unmap_region go the right way - * when do_mmap_pgoff unwinds (may be important on powerpc and ia64).
+ * vma address alignment (but not the pgoff alignment) has + * already been checked by prepare_hugepage_range. If you add + * any error returns here, do so after setting VM_HUGETLB, so + * is_vm_hugetlb_page tests below unmap_region go the right + * way when do_mmap_pgoff unwinds (may be important on powerpc + * and ia64). */ vma->vm_flags |= VM_HUGETLB | VM_RESERVED; vma->vm_ops = &hugetlb_vm_ops; + if (vma->vm_pgoff & ~(HPAGE_MASK >> PAGE_SHIFT)) + return -EINVAL; + vma_len = (loff_t)(vma->vm_end - vma->vm_start); mutex_lock(&inode->i_mutex); @@ -132,7 +137,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len, pgoff)) + if (prepare_hugepage_range(addr, len)) return -EINVAL; return addr; } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e6a71c82d204..3a19b032c0eb 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -66,11 +66,8 @@ void hugetlb_free_pgd_range(struct mmu_gather **tlb, unsigned long addr, * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ -static inline int prepare_hugepage_range(unsigned long addr, unsigned long len, - pgoff_t pgoff) +static inline int prepare_hugepage_range(unsigned long addr, unsigned long len) { - if (pgoff & (~HPAGE_MASK >> PAGE_SHIFT)) - return -EINVAL; if (len & ~HPAGE_MASK) return -EINVAL; if (addr & ~HPAGE_MASK) @@ -78,8 +75,7 @@ static inline int prepare_hugepage_range(unsigned long addr, unsigned long len, return 0; } #else -int prepare_hugepage_range(unsigned long addr, unsigned long len, - pgoff_t pgoff); +int prepare_hugepage_range(unsigned long addr, unsigned long len); #endif #ifndef ARCH_HAS_SETCLEAR_HUGE_PTE @@ -117,7 +113,7 @@ static inline unsigned long hugetlb_total_pages(void) #define hugetlb_report_meminfo(buf) 0 #define hugetlb_report_node_meminfo(n, buf) 0 #define follow_huge_pmd(mm, addr, pmd, write) NULL -#define prepare_hugepage_range(addr,len,pgoff) (-EINVAL) +#define prepare_hugepage_range(addr,len) (-EINVAL) #define pmd_huge(x) 0 #define is_hugepage_only_range(mm, addr, len) 0 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) -- cgit From 1b3b4a1a2deb7d3e5d66063bd76304d840c966b3 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 28 Aug 2007 10:29:36 -0400 Subject: NFS: Fix a write request leak in nfs_invalidate_page() Ryusuke Konishi says: The recent truncate_complete_page() clears the dirty flag from a page before calling a_ops->invalidatepage(), static void truncate_complete_page(struct address_space *mapping, struct page *page) { ... cancel_dirty_page(page, PAGE_CACHE_SIZE); <--- Inserted here at kernel 2.6.20 if (PagePrivate(page)) do_invalidatepage(page, 0); ---> will call a_ops->invalidatepage() ... } and this prevents nfs_wb_page_priority() from calling nfs_writepage_locked() that is expected to handle the pending request (=nfs_page) associated with the page. int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) { ... if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out; } ... } Since truncate_complete_page() will get rid of the page after a_ops->invalidatepage() returns, the request (=nfs_page) associated with the page becomes garbage in nfs_inode->nfs_page_tree.
------------------------ Fix this by ensuring that nfs_wb_page_priority() recognises that it may also need to clear out non-dirty pages that have an nfs_page associated with them. Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 2 +- fs/nfs/write.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ include/linux/nfs_fs.h | 1 + 3 files changed, 46 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index c87dc713b5d7..579cf8a7d4a7 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -316,7 +316,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) if (offset != 0) return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_priority(page->mapping->host, page, FLUSH_INVALIDATE); + nfs_wb_page_cancel(page->mapping->host, page); } static int nfs_release_page(struct page *page, gfp_t gfp) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ef97e0c0f5b1..0d7a77cc394b 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1396,6 +1396,50 @@ out: return ret; } +int nfs_wb_page_cancel(struct inode *inode, struct page *page) +{ + struct nfs_page *req; + loff_t range_start = page_offset(page); + loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); + struct writeback_control wbc = { + .bdi = page->mapping->backing_dev_info, + .sync_mode = WB_SYNC_ALL, + .nr_to_write = LONG_MAX, + .range_start = range_start, + .range_end = range_end, + }; + int ret = 0; + + BUG_ON(!PageLocked(page)); + for (;;) { + req = nfs_page_find_request(page); + if (req == NULL) + goto out; + if (test_bit(PG_NEED_COMMIT, &req->wb_flags)) { + nfs_release_request(req); + break; + } + if (nfs_lock_request_dontget(req)) { + nfs_inode_remove_request(req); + /* + * In case nfs_inode_remove_request has marked the + * page as being dirty + */ + cancel_dirty_page(page, PAGE_CACHE_SIZE); + nfs_unlock_request(req); + break; + } + ret = nfs_wait_on_request(req); + if (ret < 0) + goto out; + } + if (!PagePrivate(page)) + return 0; + ret = nfs_sync_mapping_wait(page->mapping, &wbc, FLUSH_INVALIDATE); +out: + return ret; +} + int nfs_wb_page_priority(struct inode *inode, struct page *page, int how) { loff_t range_start = page_offset(page); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 157dcb055b5c..7250eeadd7b5 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -431,6 +431,7 @@ extern int nfs_sync_mapping_range(struct address_space *, loff_t, loff_t, int); extern int nfs_wb_all(struct inode *inode); extern int nfs_wb_page(struct inode *inode, struct page* page); extern int nfs_wb_page_priority(struct inode *inode, struct page* page, int how); +extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commit_alloc(void); -- cgit From 9e3d3d07de1a9f2b83299653b75bfdbc0a8118f2 Mon Sep 17 00:00:00 2001 From: Samuel Thibault Date: Tue, 4 Sep 2007 23:16:04 -0400 Subject: Input: add more Braille keycodes Some braille keyboards have 10 dots, so extend the Input braille keys definitions. 
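As an illustrative sketch (not part of this patch; the function names here are hypothetical), a driver for such a 10-dot keyboard would advertise and report the new keycodes like any other key:

	#include <linux/input.h>

	/* Sketch: declare the new dots when setting up the input device... */
	static void example_setup_keys(struct input_dev *dev)
	{
		set_bit(EV_KEY, dev->evbit);
		set_bit(KEY_BRL_DOT9, dev->keybit);
		set_bit(KEY_BRL_DOT10, dev->keybit);
	}

	/* ...and report them like any other key event. */
	static void example_report_dot9(struct input_dev *dev, int pressed)
	{
		input_report_key(dev, KEY_BRL_DOT9, pressed);
		input_sync(dev);
	}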
Signed-off-by: Samuel Thibault Signed-off-by: Dmitry Torokhov --- include/linux/input.h | 2 ++ include/linux/keyboard.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/input.h b/include/linux/input.h index e02c6a66b2ba..17df5a7331c7 100644 --- a/include/linux/input.h +++ b/include/linux/input.h @@ -552,6 +552,8 @@ struct input_absinfo { #define KEY_BRL_DOT6 0x1f6 #define KEY_BRL_DOT7 0x1f7 #define KEY_BRL_DOT8 0x1f8 +#define KEY_BRL_DOT9 0x1f9 +#define KEY_BRL_DOT10 0x1fa /* We avoid low common keys in module aliases so they don't get huge. */ #define KEY_MIN_INTERESTING KEY_MUTE diff --git a/include/linux/keyboard.h b/include/linux/keyboard.h index de76843bbe8a..7ddbc30aa8e7 100644 --- a/include/linux/keyboard.h +++ b/include/linux/keyboard.h @@ -437,8 +437,10 @@ extern unsigned short plain_map[NR_KEYS]; #define K_BRL_DOT6 K(KT_BRL, 6) #define K_BRL_DOT7 K(KT_BRL, 7) #define K_BRL_DOT8 K(KT_BRL, 8) +#define K_BRL_DOT9 K(KT_BRL, 9) +#define K_BRL_DOT10 K(KT_BRL, 10) -#define NR_BRL 9 +#define NR_BRL 11 #define MAX_DIACR 256 #endif -- cgit From b311ec4ae82b1dc337689e966dcf9c5f6a53877e Mon Sep 17 00:00:00 2001 From: Joseph Chan Date: Mon, 10 Sep 2007 22:06:01 -0400 Subject: [libata, IDE] add new VIA bridge to VIA PATA drivers Signed-off-by: Joseph Chan Signed-off-by: Jeff Garzik --- drivers/ata/pata_via.c | 1 + drivers/ide/pci/via82cxxx.c | 1 + include/linux/pci_ids.h | 1 + 3 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/drivers/ata/pata_via.c b/drivers/ata/pata_via.c index 1b0acafad1a6..636c4f1a0b24 100644 --- a/drivers/ata/pata_via.c +++ b/drivers/ata/pata_via.c @@ -97,6 +97,7 @@ static const struct via_isa_bridge { u8 rev_max; u16 flags; } via_isa_bridges[] = { + { "vx800", PCI_DEVICE_ID_VIA_VX800, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "vt8251", PCI_DEVICE_ID_VIA_8251, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, VIA_UDMA_133 | VIA_BAD_AST }, diff --git a/drivers/ide/pci/via82cxxx.c b/drivers/ide/pci/via82cxxx.c index 581316f9581d..8c539381d622 100644 --- a/drivers/ide/pci/via82cxxx.c +++ b/drivers/ide/pci/via82cxxx.c @@ -74,6 +74,7 @@ static struct via_isa_bridge { u8 udma_mask; u8 flags; } via_isa_bridges[] = { + { "vx800", PCI_DEVICE_ID_VIA_VX800, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "cx700", PCI_DEVICE_ID_VIA_CX700, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "vt8237s", PCI_DEVICE_ID_VIA_8237S, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, { "vt6410", PCI_DEVICE_ID_VIA_6410, 0x00, 0x2f, ATA_UDMA6, VIA_BAD_AST }, diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 17168f3cc73f..d41747b9fd15 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -1343,6 +1343,7 @@ #define PCI_DEVICE_ID_VIA_8231_4 0x8235 #define PCI_DEVICE_ID_VIA_8365_1 0x8305 #define PCI_DEVICE_ID_VIA_CX700 0x8324 +#define PCI_DEVICE_ID_VIA_VX800 0x8353 #define PCI_DEVICE_ID_VIA_8371_1 0x8391 #define PCI_DEVICE_ID_VIA_82C598_1 0x8598 #define PCI_DEVICE_ID_VIA_838X_1 0xB188 -- cgit From 16fcec35e7d7c4faaa4709f6434a4a25b06d25e3 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 11 Sep 2007 11:28:26 +0200 Subject: [NETFILTER]: Fix/improve deadlock condition on module removal netfilter So I've had a deadlock reported to me. 
I've found that the sequence of events goes like this: 1) process A (modprobe) runs to remove ip_tables.ko 2) process B (iptables-restore) runs and calls setsockopt on a netfilter socket, increasing the ip_tables socket_ops use count 3) process A acquires a file lock on the file ip_tables.ko, calls remove_module in the kernel, which in turn executes the ip_tables module cleanup routine, which calls nf_unregister_sockopt 4) nf_unregister_sockopt, seeing that the use count is non-zero, puts the calling process into uninterruptible sleep, expecting the process using the socket option code to wake it up when it exits the kernel 5) the user of the socket option code (process B) in do_ipt_get_ctl, calls ipt_find_table_lock, which in this case calls request_module to load ip_tables_nat.ko 6) request_module forks a copy of modprobe (process C) to load the module and blocks until modprobe exits. 7) Process C, forked by request_module, processes the dependencies of ip_tables_nat.ko, of which ip_tables.ko is one. 8) Process C attempts to lock the requested module and all its dependencies; it blocks when it attempts to lock ip_tables.ko (which was previously locked in step 3) There's not really any great permanent solution to this that I can see, but I've developed a two-part solution that corrects the problem. Part 1) Modifies the nf_sockopt registration code so that, instead of using a use counter internal to the nf_sockopt_ops structure, we instead use a pointer to the registering module's owner to do module reference counting when nf_sockopt calls a module's set/get routine. This prevents the deadlock by preventing step 4 from happening. Part 2) Enhances the modprobe utility so that by default it performs non-blocking remove operations (the same way rmmod does), and adds an option to explicitly request blocking operation. So if you select blocking operation in modprobe you can still cause the above deadlock, but only if you explicitly try (and since root can do any old stupid thing it would like.... :) ). Signed-off-by: Neil Horman Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter.h | 5 ++-- net/bridge/netfilter/ebtables.c | 1 + net/ipv4/ipvs/ip_vs_ctl.c | 1 + net/ipv4/netfilter/arp_tables.c | 1 + net/ipv4/netfilter/ip_tables.c | 1 + net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + net/netfilter/nf_sockopt.c | 36 ++++++++------------------ 8 files changed, 19 insertions(+), 28 deletions(-) (limited to 'include/linux') diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h index 0eed0b7ab2df..1dd075eda595 100644 --- a/include/linux/netfilter.h +++ b/include/linux/netfilter.h @@ -88,9 +88,8 @@ struct nf_sockopt_ops int (*compat_get)(struct sock *sk, int optval, void __user *user, int *len); - /* Number of users inside set() or get(). */ - unsigned int use; - struct task_struct *cleanup_task; + /* Use the module struct to lock set/get code in place */ + struct module *owner; }; /* Each queued (to userspace) skbuff has one of these.
*/ diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 4169a2a89a39..6018d0e51938 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1513,6 +1513,7 @@ static struct nf_sockopt_ops ebt_sockopts = .get_optmin = EBT_BASE_CTL, .get_optmax = EBT_SO_GET_MAX + 1, .get = do_ebt_get_ctl, + .owner = THIS_MODULE, }; static int __init ebtables_init(void) diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 902fd578aa3c..f656d41d8d41 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -2339,6 +2339,7 @@ static struct nf_sockopt_ops ip_vs_sockopts = { .get_optmin = IP_VS_BASE_CTL, .get_optmax = IP_VS_SO_GET_MAX+1, .get = do_ip_vs_get_ctl, + .owner = THIS_MODULE, }; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index d1149aba9351..29114a9ccd1d 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1161,6 +1161,7 @@ static struct nf_sockopt_ops arpt_sockopts = { .get_optmin = ARPT_BASE_CTL, .get_optmax = ARPT_SO_GET_MAX+1, .get = do_arpt_get_ctl, + .owner = THIS_MODULE, }; static int __init arp_tables_init(void) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index e1b402c6b855..6486894f450c 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -2296,6 +2296,7 @@ static struct nf_sockopt_ops ipt_sockopts = { #ifdef CONFIG_COMPAT .compat_get = compat_do_ipt_get_ctl, #endif + .owner = THIS_MODULE, }; static struct xt_match icmp_matchstruct __read_mostly = { diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 53cb1772f38f..f813e02aab30 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -399,6 +399,7 @@ static struct nf_sockopt_ops so_getorigdst = { .get_optmin = SO_ORIGINAL_DST, .get_optmax = SO_ORIGINAL_DST+1, .get = &getorigdst, + .owner = THIS_MODULE, }; struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4 __read_mostly = { diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index aeda617246b7..cd9df02bb85c 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1462,6 +1462,7 @@ static struct nf_sockopt_ops ip6t_sockopts = { .get_optmin = IP6T_BASE_CTL, .get_optmax = IP6T_SO_GET_MAX+1, .get = do_ip6t_get_ctl, + .owner = THIS_MODULE, }; static struct xt_match icmp6_matchstruct __read_mostly = { diff --git a/net/netfilter/nf_sockopt.c b/net/netfilter/nf_sockopt.c index 8b8ece750313..e32761ce260c 100644 --- a/net/netfilter/nf_sockopt.c +++ b/net/netfilter/nf_sockopt.c @@ -55,18 +55,7 @@ EXPORT_SYMBOL(nf_register_sockopt); void nf_unregister_sockopt(struct nf_sockopt_ops *reg) { - /* No point being interruptible: we're probably in cleanup_module() */ - restart: mutex_lock(&nf_sockopt_mutex); - if (reg->use != 0) { - /* To be woken by nf_sockopt call... */ - /* FIXME: Stuart Young's name appears gratuitously. 
*/ - set_current_state(TASK_UNINTERRUPTIBLE); - reg->cleanup_task = current; - mutex_unlock(&nf_sockopt_mutex); - schedule(); - goto restart; - } list_del(®->list); mutex_unlock(&nf_sockopt_mutex); } @@ -86,10 +75,11 @@ static int nf_sockopt(struct sock *sk, int pf, int val, list_for_each(i, &nf_sockopts) { ops = (struct nf_sockopt_ops *)i; if (ops->pf == pf) { + if (!try_module_get(ops->owner)) + goto out_nosup; if (get) { if (val >= ops->get_optmin && val < ops->get_optmax) { - ops->use++; mutex_unlock(&nf_sockopt_mutex); ret = ops->get(sk, val, opt, len); goto out; @@ -97,23 +87,20 @@ static int nf_sockopt(struct sock *sk, int pf, int val, } else { if (val >= ops->set_optmin && val < ops->set_optmax) { - ops->use++; mutex_unlock(&nf_sockopt_mutex); ret = ops->set(sk, val, opt, *len); goto out; } } + module_put(ops->owner); } } + out_nosup: mutex_unlock(&nf_sockopt_mutex); return -ENOPROTOOPT; out: - mutex_lock(&nf_sockopt_mutex); - ops->use--; - if (ops->cleanup_task) - wake_up_process(ops->cleanup_task); - mutex_unlock(&nf_sockopt_mutex); + module_put(ops->owner); return ret; } @@ -144,10 +131,12 @@ static int compat_nf_sockopt(struct sock *sk, int pf, int val, list_for_each(i, &nf_sockopts) { ops = (struct nf_sockopt_ops *)i; if (ops->pf == pf) { + if (!try_module_get(ops->owner)) + goto out_nosup; + if (get) { if (val >= ops->get_optmin && val < ops->get_optmax) { - ops->use++; mutex_unlock(&nf_sockopt_mutex); if (ops->compat_get) ret = ops->compat_get(sk, @@ -160,7 +149,6 @@ static int compat_nf_sockopt(struct sock *sk, int pf, int val, } else { if (val >= ops->set_optmin && val < ops->set_optmax) { - ops->use++; mutex_unlock(&nf_sockopt_mutex); if (ops->compat_set) ret = ops->compat_set(sk, @@ -171,17 +159,15 @@ static int compat_nf_sockopt(struct sock *sk, int pf, int val, goto out; } } + module_put(ops->owner); } } + out_nosup: mutex_unlock(&nf_sockopt_mutex); return -ENOPROTOOPT; out: - mutex_lock(&nf_sockopt_mutex); - ops->use--; - if (ops->cleanup_task) - wake_up_process(ops->cleanup_task); - mutex_unlock(&nf_sockopt_mutex); + module_put(ops->owner); return ret; } -- cgit From 5547bbeed37f7ab64942ffcce9293681101577ef Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 23 Aug 2007 10:37:53 -0700 Subject: PCI AER: fix warnings when PCIEAER=n Fix warnings when CONFIG_PCIEAER=n: drivers/pci/pcie/portdrv_pci.c:105: warning: statement with no effect drivers/pci/pcie/portdrv_pci.c:226: warning: statement with no effect drivers/scsi/arcmsr/arcmsr_hba.c:352: warning: statement with no effect Signed-off-by: Randy Dunlap Acked-by: Linas Vepstas Signed-off-by: Greg Kroah-Hartman --- include/linux/aer.h | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aer.h b/include/linux/aer.h index 509656286e53..bcf236d825e8 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -15,11 +15,26 @@ extern int pci_disable_pcie_error_reporting(struct pci_dev *dev); extern int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev); extern int pci_cleanup_aer_correct_error_status(struct pci_dev *dev); #else -#define pci_enable_pcie_error_reporting(dev) (-EINVAL) -#define pci_find_aer_capability(dev) (0) -#define pci_disable_pcie_error_reporting(dev) (-EINVAL) -#define pci_cleanup_aer_uncorrect_error_status(dev) (-EINVAL) -#define pci_cleanup_aer_correct_error_status(dev) (-EINVAL) +static inline int pci_enable_pcie_error_reporting(struct pci_dev *dev) +{ + return -EINVAL; +} +static inline int 
pci_find_aer_capability(struct pci_dev *dev) +{ + return 0; +} +static inline int pci_disable_pcie_error_reporting(struct pci_dev *dev) +{ + return -EINVAL; +} +static inline int pci_cleanup_aer_uncorrect_error_status(struct pci_dev *dev) +{ + return -EINVAL; +} +static inline int pci_cleanup_aer_correct_error_status(struct pci_dev *dev) +{ + return -EINVAL; +} #endif #endif //_AER_H_ -- cgit From 99fa9844f0eed5582b5648f745204758b27db659 Mon Sep 17 00:00:00 2001 From: Jason Gaston Date: Thu, 30 Aug 2007 17:50:56 -0700 Subject: PCI: irq and pci_ids patch for Intel Tolapai This patch adds the Intel Tolapai LPC and SMBus Controller DID's. Signed-off-by: Jason Gaston Signed-off-by: Greg Kroah-Hartman --- arch/i386/pci/irq.c | 1 + include/linux/pci_ids.h | 2 ++ 2 files changed, 3 insertions(+) (limited to 'include/linux') diff --git a/arch/i386/pci/irq.c b/arch/i386/pci/irq.c index 665db063a40a..8434f2323b87 100644 --- a/arch/i386/pci/irq.c +++ b/arch/i386/pci/irq.c @@ -550,6 +550,7 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH9_3: case PCI_DEVICE_ID_INTEL_ICH9_4: case PCI_DEVICE_ID_INTEL_ICH9_5: + case PCI_DEVICE_ID_INTEL_TOLAPAI_0: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index d41747b9fd15..55f307ffbf96 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -2293,6 +2293,8 @@ #define PCI_DEVICE_ID_INTEL_MCH_PC 0x3599 #define PCI_DEVICE_ID_INTEL_MCH_PC1 0x359a #define PCI_DEVICE_ID_INTEL_E7525_MCH 0x359e +#define PCI_DEVICE_ID_INTEL_TOLAPAI_0 0x5031 +#define PCI_DEVICE_ID_INTEL_TOLAPAI_1 0x5032 #define PCI_DEVICE_ID_INTEL_82371SB_0 0x7000 #define PCI_DEVICE_ID_INTEL_82371SB_1 0x7010 #define PCI_DEVICE_ID_INTEL_82371SB_2 0x7020 -- cgit From 6c3c22f3cb2b7cd0a42a024b93db76b5c3133d37 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Tue, 11 Sep 2007 22:28:36 +0200 Subject: ide: add ide_dev_is_sata() helper (take 2) Make the SATA drive detection code from eighty_ninty_three() into inline ide_dev_is_sata() helper fixing it along the way to be more strict while checking word 80 for the reserved values... Signed-off-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-iops.c | 3 +-- include/linux/ide.h | 13 +++++++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/drivers/ide/ide-iops.c b/drivers/ide/ide-iops.c index f4cd2700cae5..646a54e233d3 100644 --- a/drivers/ide/ide-iops.c +++ b/drivers/ide/ide-iops.c @@ -615,8 +615,7 @@ u8 eighty_ninty_three (ide_drive_t *drive) if (hwif->cbl != ATA_CBL_PATA80 && !ivb) goto no_80w; - /* Check for SATA but only if we are ATA5 or higher */ - if (id->hw_config == 0 && (id->major_rev_num & 0x7FE0)) + if (ide_dev_is_sata(id)) return 1; /* diff --git a/include/linux/ide.h b/include/linux/ide.h index c792b4fd1588..b9f66c10caa0 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1378,6 +1378,19 @@ static inline int ide_dev_has_iordy(struct hd_driveid *id) return ((id->field_valid & 2) && (id->capability & 8)) ? 1 : 0; } +static inline int ide_dev_is_sata(struct hd_driveid *id) +{ + /* + * See if word 93 is 0 AND drive is at least ATA-5 compatible + * verifying that word 80 by casting it to a signed type -- + * this trick allows us to filter out the reserved values of + * 0x0000 and 0xffff along with the earlier ATA revisions... 
+ */ + if (id->hw_config == 0 && (short)id->major_rev_num >= 0x0020) + return 1; + return 0; +} + u8 ide_dump_status(ide_drive_t *, const char *, u8); typedef struct ide_pio_timings_s { -- cgit From df96efd73b81b8bc2d23b3d8b6025cce3d43db6c Mon Sep 17 00:00:00 2001 From: Yoichi Yuasa Date: Tue, 11 Sep 2007 22:24:45 +0100 Subject: leds: Add missing include for leds.h This patch has added #include <linux/spinlock.h> to include/linux/leds.h for rwlock_t. Signed-off-by: Yoichi Yuasa Signed-off-by: Richard Purdie --- include/linux/leds.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/leds.h b/include/linux/leds.h index 421175092ee2..dc1178f6184b 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -13,6 +13,7 @@ #define __LINUX_LEDS_H_INCLUDED #include <linux/list.h> +#include <linux/spinlock.h> struct device; /* -- cgit From a83308e60f63749dc1d08acb0d8fa9e2ec13c9a7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Tue, 11 Sep 2007 15:23:47 -0700 Subject: PTR_ALIGN The AdvanSys driver wants to align some pointers, and the ALIGN macro doesn't work for pointers. Rather than try to make it work, add a new PTR_ALIGN macro which is typesafe. Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f592df74b3cf..47160fe378c9 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -34,6 +34,7 @@ extern const char linux_proc_banner[]; #define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) +#define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr)) -- cgit From dd23aae4f5edf4e1dbd8f7f8013a754ba3253f48 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 11 Sep 2007 15:23:55 -0700 Subject: Fix select on /proc files without ->poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Taneli Vähäkangas reported that commit 786d7e1612f0b0adb6046f19b906609e4fe8b1ba aka "Fix rmmod/read/write races in /proc entries" broke the SBCL + SLIME combo. The old code in do_select() used DEFAULT_POLLMASK if it couldn't find a ->poll handler. The new code makes ->poll always present and returns 0 by default, which is not correct. Return DEFAULT_POLLMASK instead. Steps to reproduce: install emacs, SBCL, SLIME emacs M-x slime in *inferior-lisp* buffer [watch it doing "Connecting to Swank on port X.."] Please, apply before 2.6.23. P.S.: why SBCL can't just read(2) /proc/cpuinfo is a mystery. Signed-off-by: Alexey Dobriyan Cc: Taneli Vähäkangas Cc: Oleg Nesterov Cc: "Eric W.
Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 3 ++- fs/select.c | 2 -- include/linux/poll.h | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index a5b0dfd89a17..0e4d37c93eea 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -232,7 +233,7 @@ static ssize_t proc_reg_write(struct file *file, const char __user *buf, size_t static unsigned int proc_reg_poll(struct file *file, struct poll_table_struct *pts) { struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode); - unsigned int rv = 0; + unsigned int rv = DEFAULT_POLLMASK; unsigned int (*poll)(struct file *, struct poll_table_struct *); spin_lock(&pde->pde_unload_lock); diff --git a/fs/select.c b/fs/select.c index a974082b0824..46dca31c607a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -26,8 +26,6 @@ #include -#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) - struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; diff --git a/include/linux/poll.h b/include/linux/poll.h index 27690798623f..16d813b364ef 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -21,6 +21,8 @@ #define WQUEUES_STACK_ALLOC (MAX_STACK_ALLOC - FRONTEND_STACK_ALLOC) #define N_INLINE_POLL_ENTRIES (WQUEUES_STACK_ALLOC / sizeof(struct poll_table_entry)) +#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) + struct poll_table_struct; /* -- cgit From d9cc20484e5e48c6a5deb4387c20fd45bfbdde8c Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sun, 16 Sep 2007 16:21:16 -0700 Subject: [NET] skbuff: Add skb_cow_head This patch adds an optimised version of skb_cow that avoids the copy if the header can be modified even if the rest of the payload is cloned. This can be used in encapsulating paths where we only need to modify the header. As it is, this can be used in PPPOE and bridging. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/pppoe.c | 2 +- include/linux/skbuff.h | 40 +++++++++++++++++++++++++++++++--------- net/bridge/br_netfilter.c | 2 +- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index bac36546e0bf..0d7f570b9a54 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -860,7 +860,7 @@ static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb) /* Copy the data if there is no space for the header or if it's * read-only. 
*/ - if (skb_cow(skb, sizeof(*ph) + dev->hard_header_len)) + if (skb_cow_head(skb, sizeof(*ph) + dev->hard_header_len)) goto abort; __skb_push(skb, sizeof(*ph)); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 93c27f71122a..a656cecd373c 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1352,6 +1352,22 @@ static inline int skb_clone_writable(struct sk_buff *skb, int len) skb_headroom(skb) + len <= skb->hdr_len; } +static inline int __skb_cow(struct sk_buff *skb, unsigned int headroom, + int cloned) +{ + int delta = 0; + + if (headroom < NET_SKB_PAD) + headroom = NET_SKB_PAD; + if (headroom > skb_headroom(skb)) + delta = headroom - skb_headroom(skb); + + if (delta || cloned) + return pskb_expand_head(skb, ALIGN(delta, NET_SKB_PAD), 0, + GFP_ATOMIC); + return 0; +} + /** * skb_cow - copy header of skb when it is required * @skb: buffer to cow @@ -1366,16 +1382,22 @@ static inline int skb_clone_writable(struct sk_buff *skb, int len) */ static inline int skb_cow(struct sk_buff *skb, unsigned int headroom) { - int delta = (headroom > NET_SKB_PAD ? headroom : NET_SKB_PAD) - - skb_headroom(skb); - - if (delta < 0) - delta = 0; + return __skb_cow(skb, headroom, skb_cloned(skb)); +} - if (delta || skb_cloned(skb)) - return pskb_expand_head(skb, (delta + (NET_SKB_PAD-1)) & - ~(NET_SKB_PAD-1), 0, GFP_ATOMIC); - return 0; +/** + * skb_cow_head - skb_cow but only making the head writable + * @skb: buffer to cow + * @headroom: needed headroom + * + * This function is identical to skb_cow except that we replace the + * skb_cloned check by skb_header_cloned. It should be used when + * you only need to push on some header and do not need to modify + * the data. + */ +static inline int skb_cow_head(struct sk_buff *skb, unsigned int headroom) +{ + return __skb_cow(skb, headroom, skb_header_cloned(skb)); } /** diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3ee2022928e3..fc13130035e7 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -183,7 +183,7 @@ int nf_bridge_copy_header(struct sk_buff *skb) int err; int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb); - err = skb_cow(skb, header_size); + err = skb_cow_head(skb, header_size); if (err) return err; -- cgit From fa890d586cc127ce72597ba0a909bfecf784e10c Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sun, 16 Sep 2007 17:01:26 -0600 Subject: Fix non-ISA link error in drivers/scsi/advansys.c When CONFIG_ISA is disabled, the isa_driver support will not be compiled in. Define stubs so that we don't get link-time errors. 
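To illustrate the effect (a hedged sketch, with an invented driver name): callers can now use the ISA API unconditionally, and on !CONFIG_ISA builds the call collapses to the inline stub:

	#include <linux/init.h>
	#include <linux/isa.h>

	static struct isa_driver example_isa_driver;	/* hypothetical */

	static int __init example_init(void)
	{
		/* Resolves to the no-op stub (returning 0) when
		 * CONFIG_ISA is not set, so no link error occurs.
		 */
		return isa_register_driver(&example_isa_driver, 4);
	}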
Signed-off-by: Matthew Wilcox Signed-off-by: Linus Torvalds --- include/linux/isa.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/isa.h b/include/linux/isa.h index 1b855335cb11..b0270e3814c8 100644 --- a/include/linux/isa.h +++ b/include/linux/isa.h @@ -22,7 +22,18 @@ struct isa_driver { #define to_isa_driver(x) container_of((x), struct isa_driver, driver) +#ifdef CONFIG_ISA int isa_register_driver(struct isa_driver *, unsigned int); void isa_unregister_driver(struct isa_driver *); +#else +static inline int isa_register_driver(struct isa_driver *d, unsigned int i) +{ + return 0; +} + +static inline void isa_unregister_driver(struct isa_driver *d) +{ +} +#endif #endif /* __LINUX_ISA_H */ -- cgit From 735de2230f09741077a645a913de0a04b10208bf Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 18 Sep 2007 22:46:44 -0700 Subject: Convert uid hash to hlist Surprisingly (as spotted by Alexey Dobriyan), the uid hash still uses list_heads, thus occupying twice as much space as necessary. Convert it to hlist_heads. Signed-off-by: Pavel Emelyanov Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 +- include/linux/user_namespace.h | 2 +- kernel/user.c | 15 ++++++++------- kernel/user_namespace.c | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index f4e324ed2e44..6239bc2c2baa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -593,7 +593,7 @@ struct user_struct { #endif /* Hash table maintenance information */ - struct list_head uidhash_list; + struct hlist_node uidhash_node; uid_t uid; }; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 1101b0ce878f..b5f41d4c2eec 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -11,7 +11,7 @@ struct user_namespace { struct kref kref; - struct list_head uidhash_table[UIDHASH_SZ]; + struct hlist_head uidhash_table[UIDHASH_SZ]; struct user_struct *root_user; }; diff --git a/kernel/user.c b/kernel/user.c index e080ba863ae3..add57c7e4c07 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,21 +55,22 @@ struct user_struct root_user = { /* * These routines must be called with the uidhash spinlock held!
-static inline void uid_hash_insert(struct user_struct *up, struct list_head *hashent) +static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) { - list_add(&up->uidhash_list, hashent); + hlist_add_head(&up->uidhash_node, hashent); } static inline void uid_hash_remove(struct user_struct *up) { - list_del(&up->uidhash_list); + hlist_del(&up->uidhash_node); } -static inline struct user_struct *uid_hash_find(uid_t uid, struct list_head *hashent) +static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) { struct user_struct *user; + struct hlist_node *h; - list_for_each_entry(user, hashent, uidhash_list) { + hlist_for_each_entry(user, h, hashent, uidhash_node) { if(user->uid == uid) { atomic_inc(&user->__count); return user; @@ -118,7 +119,7 @@ void free_uid(struct user_struct *up) struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) { - struct list_head *hashent = uidhashentry(ns, uid); + struct hlist_head *hashent = uidhashentry(ns, uid); struct user_struct *up; spin_lock_irq(&uidhash_lock); @@ -207,7 +208,7 @@ static int __init uid_cache_init(void) 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); for(n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(init_user_ns.uidhash_table + n); + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); /* Insert the root user immediately (init already runs as root) */ spin_lock_irq(&uidhash_lock); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 85af9422ea6e..e7ba1bf8457c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -39,7 +39,7 @@ static struct user_namespace *clone_user_ns(struct user_namespace *old_ns) kref_init(&ns->kref); for (n = 0; n < UIDHASH_SZ; ++n) - INIT_LIST_HEAD(ns->uidhash_table + n); + INIT_HLIST_HEAD(ns->uidhash_table + n); /* Insert new root user. */ ns->root_user = alloc_uid(ns, 0); -- cgit From 28f300d23674fa01ae747c66ce861d4ee6aebe8c Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 18 Sep 2007 22:46:45 -0700 Subject: Fix user namespace exiting OOPs It turned out that the user namespace is released during the do_exit() in exit_task_namespaces(), but the struct user_struct is released only during the put_task_struct(), i.e. MUCH later. On debug kernels with poisoned slabs this will cause the oops in uid_hash_remove() because the head of the chain, which resides inside the struct user_namespace, will already have been freed and poisoned. Since the uid hash itself is required only when someone can search it, i.e. when the namespace is alive, we can safely unhash all the user_struct-s from it while the namespace is exiting. The subsequent free_uid() will complete the user_struct destruction. For example, this simple program #include <sched.h> char stack[2 * 1024 * 1024]; int f(void *foo) { return 0; } int main(void) { clone(f, stack + 1 * 1024 * 1024, 0x10000000, 0); return 0; } run on a kernel with CONFIG_USER_NS turned on will oops the kernel immediately. This was spotted during OpenVZ kernel testing. Signed-off-by: Pavel Emelyanov Signed-off-by: Alexey Dobriyan Acked-by: "Serge E.
Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/user.c | 26 +++++++++++++++++++++++++- kernel/user_namespace.c | 2 +- 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6239bc2c2baa..5445eaec6908 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1472,6 +1472,7 @@ static inline struct user_struct *get_uid(struct user_struct *u) } extern void free_uid(struct user_struct *); extern void switch_uid(struct user_struct *); +extern void release_uids(struct user_namespace *ns); #include diff --git a/kernel/user.c b/kernel/user.c index add57c7e4c07..9ca2848fc356 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -62,7 +62,7 @@ static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *ha static inline void uid_hash_remove(struct user_struct *up) { - hlist_del(&up->uidhash_node); + hlist_del_init(&up->uidhash_node); } static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) @@ -199,6 +199,30 @@ void switch_uid(struct user_struct *new_user) suid_keys(current); } +void release_uids(struct user_namespace *ns) +{ + int i; + unsigned long flags; + struct hlist_head *head; + struct hlist_node *nd; + + spin_lock_irqsave(&uidhash_lock, flags); + /* + * collapse the chains so that the user_struct-s will + * be still alive, but not in hashes. subsequent free_uid() + * will free them. + */ + for (i = 0; i < UIDHASH_SZ; i++) { + head = ns->uidhash_table + i; + while (!hlist_empty(head)) { + nd = head->first; + hlist_del_init(nd); + } + } + spin_unlock_irqrestore(&uidhash_lock, flags); + + free_uid(ns->root_user); +} static int __init uid_cache_init(void) { diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index e7ba1bf8457c..7af90fc4f0fd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -81,7 +81,7 @@ void free_user_ns(struct kref *kref) struct user_namespace *ns; ns = container_of(kref, struct user_namespace, kref); - free_uid(ns->root_user); + release_uids(ns); kfree(ns); } -- cgit From 480eccf9ae1073b87bb4fe118971fbf134a5bc61 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Tue, 18 Sep 2007 22:46:47 -0700 Subject: Fix NUMA Memory Policy Reference Counting This patch proposes fixes to the reference counting of memory policy in the page allocation paths and in show_numa_map(). Extracted from my "Memory Policy Cleanups and Enhancements" series as stand-alone. Shared policy lookup [shmem] has always added a reference to the policy, but this was never unrefed after page allocation or after formatting the numa map data. Default system policy should not require additional ref counting, nor should the current task's task policy. However, show_numa_map() calls get_vma_policy() to examine what may be [likely is] another task's policy. The latter case needs protection against freeing of the policy. This patch adds a reference count to a mempolicy returned by get_vma_policy() when the policy is a vma policy or another task's mempolicy. Again, shared policy is already reference counted on lookup. A matching "unref" [__mpol_free()] is performed in alloc_page_vma() for shared and vma policies, and in show_numa_map() for shared and another task's mempolicy. We can call __mpol_free() directly, saving an admittedly inexpensive inline NULL test, because we know we have a non-NULL policy. Handling policy ref counts for hugepages is a bit trickier. 
huge_zonelist() returns a zonelist that might come from a shared or vma 'BIND policy. In this case, we should hold the reference until after the huge page allocation in dequeue_huge_page(). The patch modifies huge_zonelist() to return a pointer to the mempolicy if it needs to be unref'd after allocation.

Kernel Build [16cpu, 32GB, ia64] - average of 10 runs:

             w/o patch             w/ refcount patch
             Avg       Std Devn    Avg       Std Devn
 Real:       100.59    0.38        100.63    0.43
 User:      1209.60    0.37       1209.91    0.31
 System:      81.52    0.42         81.64    0.34

Signed-off-by: Lee Schermerhorn Acked-by: Andi Kleen Cc: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 +-- mm/hugetlb.c | 4 ++- mm/mempolicy.c | 79 +++++++++++++++++++++++++++++++++++++++++------ 3 files changed, 75 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5bdd656e88cf..a020eb2d4e2a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -159,7 +159,7 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p); extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, - unsigned long addr, gfp_t gfp_flags); + unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol); extern unsigned slab_node(struct mempolicy *policy); extern enum zone_type policy_zone; @@ -256,7 +256,7 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p) #define set_cpuset_being_rebound(x) do {} while (0) static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, - unsigned long addr, gfp_t gfp_flags) + unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol) { return NODE_DATA(0)->node_zonelists + gfp_zone(gfp_flags); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de4cf458d6e1..84c795ee2d65 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -71,8 +71,9 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, { int nid; struct page *page = NULL; + struct mempolicy *mpol; struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask); + htlb_alloc_mask, &mpol); struct zone **z; for (z = zonelist->zones; *z; z++) { @@ -87,6 +88,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma, break; } } + mpol_free(mpol); /* unref if mpol !NULL */ return page; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index bb54b88c3d5a..3d6ac9505d07 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1077,21 +1077,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif -/* Return effective policy for a VMA */ +/* + * get_vma_policy(@task, @vma, @addr) + * @task - task for fallback if vma policy == default + * @vma - virtual memory area whose policy is sought + * @addr - address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to @task or system default policy, as necessary. + * Returned policy has extra reference count if shared, vma, + * or some other task's policy [show_numa_maps() can pass + * @task != current]. It is the caller's responsibility to + * free the reference in these cases.
+ */ static struct mempolicy * get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; + int shared_pol = 0; if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) + if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy && + shared_pol = 1; /* if pol non-NULL, add ref below */ + } else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) pol = &default_policy; + else if (!shared_pol && pol != current->mempolicy) + mpol_get(pol); /* vma or other task's policy */ return pol; } @@ -1207,19 +1223,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol, } #ifdef CONFIG_HUGETLBFS -/* Return a zonelist suitable for a huge page allocation. */ +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma = virtual memory area whose policy is sought + * @addr = address in @vma for shared policy lookup and interleave policy + * @gfp_flags = for requested zone + * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * + * Returns a zonelist suitable for a huge page allocation. + * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If it is also a policy for which get_vma_policy() returns an extra + * reference, we must hold that reference until after allocation. + * In that case, return policy via @mpol so hugetlb allocation can drop + * the reference. For non-'BIND referenced policies, we can/do drop the + * reference here, so the caller doesn't need to know about the special case + * for default and current task policy. + */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags) + gfp_t gfp_flags, struct mempolicy **mpol) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; + *mpol = NULL; /* probably no unref needed */ if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + __mpol_free(pol); /* finished with pol */ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } - return zonelist_policy(GFP_HIGHUSER, pol); + + zl = zonelist_policy(GFP_HIGHUSER, pol); + if (unlikely(pol != &default_policy && pol != current->mempolicy)) { + if (pol->policy != MPOL_BIND) + __mpol_free(pol); /* finished with pol */ + else + *mpol = pol; /* unref needed after allocation */ + } + return zl; } #endif @@ -1264,6 +1306,7 @@ struct page * alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; cpuset_update_task_memory_state(); @@ -1273,7 +1316,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } - return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); + zl = zonelist_policy(gfp, pol); + if (pol != &default_policy && pol != current->mempolicy) { + /* + * slow path: ref counted policy -- shared or vma + */ + struct page *page = __alloc_pages(gfp, 0, zl); + __mpol_free(pol); + return page; + } + /* + * fast path: default or task policy + */ + return __alloc_pages(gfp, 0, zl); } /** @@ -1872,6 +1927,7 @@ int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; int n; char buffer[50]; @@ -1882,8 +1938,13 @@ int show_numa_map(struct 
seq_file *m, void *v) if (!md) return 0; - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(priv->task, vma, vma->vm_start)); + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); + /* + * unref shared or other task's mempolicy + */ + if (pol != &default_policy && pol != current->mempolicy) + __mpol_free(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); -- cgit From 1799e35d5baab6e06168b46cc78b968e728ea3d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 19 Sep 2007 23:34:46 +0200 Subject: sched: add /proc/sys/kernel/sched_compat_yield add /proc/sys/kernel/sched_compat_yield to make sys_sched_yield() more aggressive, by moving the yielding task to the last position in the rbtree.

with sched_compat_yield=0:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 2539 mingo     20   0  1576  252  204 R   50  0.0   0:02.03 loop_yield
 2541 mingo     20   0  1576  244  196 R   50  0.0   0:02.05 loop

with sched_compat_yield=1:

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
 2584 mingo     20   0  1576  248  196 R   99  0.0   0:52.45 loop
 2582 mingo     20   0  1576  256  204 R    0  0.0   0:00.00 loop_yield

Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- include/linux/sched.h | 1 + kernel/sched.c | 5 +--- kernel/sched_fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++----- kernel/sysctl.c | 8 +++++++ 4 files changed, 67 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 5445eaec6908..3de79016f2a6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1406,6 +1406,7 @@ extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_batch_wakeup_granularity; extern unsigned int sysctl_sched_stat_granularity; extern unsigned int sysctl_sched_runtime_limit; +extern unsigned int sysctl_sched_compat_yield; extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; diff --git a/kernel/sched.c b/kernel/sched.c index deeb1f8e0c30..63e0971c8fbb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4550,10 +4550,7 @@ asmlinkage long sys_sched_yield(void) struct rq *rq = this_rq_lock(); schedstat_inc(rq, yld_cnt); - if (unlikely(rq->nr_running == 1)) - schedstat_inc(rq, yld_act_empty); - else - current->sched_class->yield_task(rq, current); + current->sched_class->yield_task(rq, current); /* * Since we are going to call schedule() anyway, there's diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 892616bf2c77..c9fbe8e73a45 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -42,6 +42,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL; */ unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL; +/* + * sys_sched_yield() compat mode + * + * This option switches the aggressive yield implementation of the + * old scheduler back on. + */ +unsigned int __read_mostly sysctl_sched_compat_yield; + /* * SCHED_BATCH wake-up granularity. * (default: 25 msec, units: nanoseconds) @@ -897,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) } /* - * sched_yield() support is very simple - we dequeue and enqueue + * sched_yield() support is very simple - we dequeue and enqueue. + * + * If compat_yield is turned on then we requeue to the end of the tree.
From 1799e35d5baab6e06168b46cc78b968e728ea3d1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Wed, 19 Sep 2007 23:34:46 +0200
Subject: sched: add /proc/sys/kernel/sched_compat_yield

add /proc/sys/kernel/sched_compat_yield to make sys_sched_yield() more
aggressive, by moving the yielding task to the last position in the rbtree.

with sched_compat_yield=0:

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  2539 mingo     20   0  1576  252  204 R   50  0.0   0:02.03 loop_yield
  2541 mingo     20   0  1576  244  196 R   50  0.0   0:02.05 loop

with sched_compat_yield=1:

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  2584 mingo     20   0  1576  248  196 R   99  0.0   0:52.45 loop
  2582 mingo     20   0  1576  256  204 R    0  0.0   0:00.00 loop_yield

Signed-off-by: Ingo Molnar
Signed-off-by: Peter Zijlstra
---
 include/linux/sched.h |  1 +
 kernel/sched.c        |  5 +---
 kernel/sched_fair.c   | 63 ++++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sysctl.c       |  8 +++++++
 4 files changed, 67 insertions(+), 10 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5445eaec6908..3de79016f2a6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1406,6 +1406,7 @@ extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_batch_wakeup_granularity;
 extern unsigned int sysctl_sched_stat_granularity;
 extern unsigned int sysctl_sched_runtime_limit;
+extern unsigned int sysctl_sched_compat_yield;
 extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;

diff --git a/kernel/sched.c b/kernel/sched.c
index deeb1f8e0c30..63e0971c8fbb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4550,10 +4550,7 @@ asmlinkage long sys_sched_yield(void)
 	struct rq *rq = this_rq_lock();

 	schedstat_inc(rq, yld_cnt);
-	if (unlikely(rq->nr_running == 1))
-		schedstat_inc(rq, yld_act_empty);
-	else
-		current->sched_class->yield_task(rq, current);
+	current->sched_class->yield_task(rq, current);

 	/*
 	 * Since we are going to call schedule() anyway, there's

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 892616bf2c77..c9fbe8e73a45 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -42,6 +42,14 @@ unsigned int sysctl_sched_latency __read_mostly = 20000000ULL;
  */
 unsigned int sysctl_sched_min_granularity __read_mostly = 2000000ULL;

+/*
+ * sys_sched_yield() compat mode
+ *
+ * This option switches the aggressive yield implementation of the
+ * old scheduler back on.
+ */
+unsigned int __read_mostly sysctl_sched_compat_yield;
+
 /*
  * SCHED_BATCH wake-up granularity.
  * (default: 25 msec, units: nanoseconds)
@@ -897,19 +905,62 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 }

 /*
- * sched_yield() support is very simple - we dequeue and enqueue
+ * sched_yield() support is very simple - we dequeue and enqueue.
+ *
+ * If compat_yield is turned on then we requeue to the end of the tree.
 */
 static void yield_task_fair(struct rq *rq, struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
+	struct sched_entity *rightmost, *se = &p->se;
+	struct rb_node *parent;

-	__update_rq_clock(rq);
 	/*
-	 * Dequeue and enqueue the task to update its
-	 * position within the tree:
+	 * Are we the only task in the tree?
+	 */
+	if (unlikely(cfs_rq->nr_running == 1))
+		return;
+
+	if (likely(!sysctl_sched_compat_yield)) {
+		__update_rq_clock(rq);
+		/*
+		 * Dequeue and enqueue the task to update its
+		 * position within the tree:
+		 */
+		dequeue_entity(cfs_rq, &p->se, 0);
+		enqueue_entity(cfs_rq, &p->se, 0);
+
+		return;
+	}
+	/*
+	 * Find the rightmost entry in the rbtree:
 	 */
-	dequeue_entity(cfs_rq, &p->se, 0);
-	enqueue_entity(cfs_rq, &p->se, 0);
+	do {
+		parent = *link;
+		link = &parent->rb_right;
+	} while (*link);
+
+	rightmost = rb_entry(parent, struct sched_entity, run_node);
+	/*
+	 * Already in the rightmost position?
+	 */
+	if (unlikely(rightmost == se))
+		return;
+
+	/*
+	 * Minimally necessary key value to be last in the tree:
+	 */
+	se->fair_key = rightmost->fair_key + 1;
+
+	if (cfs_rq->rb_leftmost == &se->run_node)
+		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	/*
+	 * Relink the task to the rightmost position:
+	 */
+	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+	rb_link_node(&se->run_node, parent, link);
+	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 }

 /*

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6ace893c17c9..53a456ebf6d5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -303,6 +303,14 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_compat_yield",
+		.data		= &sysctl_sched_compat_yield,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-- cgit
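The effect of the sysctl is easy to reproduce with the same kind of busy-loop pair Ingo's top output shows. The program below is a sketch of such a test, not part of the patch: run two instances pinned to one CPU, one in yield mode, and compare their %CPU with the knob off and on (echo 1 > /proc/sys/kernel/sched_compat_yield).

    /* yieldtest.c -- run as "./yieldtest yield" for the loop_yield role,
     * plain "./yieldtest" for the pure spinner. */
    #include <sched.h>

    int main(int argc, char **argv)
    {
        for (;;) {
            if (argc > 1)           /* yielding variant */
                sched_yield();
        }
        return 0;
    }

With compat yield off, CFS keeps the two tasks near 50%/50%; with it on, the yielding task is requeued to the far right of the rbtree and gets almost no CPU, as in the second top snapshot.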
From b8fceee17a310f189188599a8fa5e9beaff57eb0 Mon Sep 17 00:00:00 2001
From: Davide Libenzi
Date: Thu, 20 Sep 2007 12:40:16 -0700
Subject: signalfd simplification

This simplifies the signalfd code by no longer keeping the signalfd
attached to the sighand for its whole lifetime. In this way, the signalfd
remains attached to the sighand only during poll(2) (and select and epoll)
and read(2). This also allows us to remove all the custom "tsk == current"
checks in kernel/signal.c, since dequeue_signal() will only be called by
"current". I think this is also what Ben was suggesting some time ago.

The external effect of this is that a thread can extract only its own
private signals and the group ones. I think this is acceptable behaviour,
in that those are the signals the thread would be able to fetch without
signalfd.

Signed-off-by: Davide Libenzi
Signed-off-by: Linus Torvalds
---
 fs/exec.c                 |   3 -
 fs/signalfd.c             | 190 +++++++---------------------------------------
 include/linux/init_task.h |   2 +-
 include/linux/sched.h     |   2 +-
 include/linux/signalfd.h  |  40 +---------
 kernel/exit.c             |   9 ---
 kernel/fork.c             |   2 +-
 kernel/signal.c           |   8 +-
 8 files changed, 39 insertions(+), 217 deletions(-)
(limited to 'include/linux')

diff --git a/fs/exec.c b/fs/exec.c
index c21a8cc06277..073b0b8c6d05 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -50,7 +50,6 @@
 #include
 #include
 #include
-#include <linux/signalfd.h>
 #include
 #include

@@ -784,7 +783,6 @@ static int de_thread(struct task_struct *tsk)
 	 * and we can just re-use it all.
 	 */
 	if (atomic_read(&oldsighand->count) <= 1) {
-		signalfd_detach(tsk);
 		exit_itimers(sig);
 		return 0;
 	}
@@ -923,7 +921,6 @@ static int de_thread(struct task_struct *tsk)
 	sig->flags = 0;

 no_thread_group:
-	signalfd_detach(tsk);
 	exit_itimers(sig);
 	if (leader)
 		release_task(leader);

diff --git a/fs/signalfd.c b/fs/signalfd.c
index a8e293d30034..aefb0be07942 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -11,8 +11,10 @@
 *  Now using anonymous inode source.
 *  Thanks to Oleg Nesterov for useful code review and suggestions.
 *  More comments and suggestions from Arnd Bergmann.
- *  Sat May 19, 2007: Davi E. M. Arnaut
+ *  Sat May 19, 2007: Davi E. M. Arnaut
+ *      Retrieve multiple signals with one read() call
+ *  Sun Jul 15, 2007: Davide Libenzi
+ *      Attach to the sighand only during read() and poll().
 */

 #include
@@ -27,102 +29,12 @@
 #include

 struct signalfd_ctx {
-	struct list_head lnk;
-	wait_queue_head_t wqh;
 	sigset_t sigmask;
-	struct task_struct *tsk;
 };

-struct signalfd_lockctx {
-	struct task_struct *tsk;
-	unsigned long flags;
-};
-
-/*
- * Tries to acquire the sighand lock. We do not increment the sighand
- * use count, and we do not even pin the task struct, so we need to
- * do it inside an RCU read lock, and we must be prepared for the
- * ctx->tsk going to NULL (in signalfd_deliver()), and for the sighand
- * being detached. We return 0 if the sighand has been detached, or
- * 1 if we were able to pin the sighand lock.
- */
-static int signalfd_lock(struct signalfd_ctx *ctx, struct signalfd_lockctx *lk)
-{
-	struct sighand_struct *sighand = NULL;
-
-	rcu_read_lock();
-	lk->tsk = rcu_dereference(ctx->tsk);
-	if (likely(lk->tsk != NULL))
-		sighand = lock_task_sighand(lk->tsk, &lk->flags);
-	rcu_read_unlock();
-
-	if (!sighand)
-		return 0;
-
-	if (!ctx->tsk) {
-		unlock_task_sighand(lk->tsk, &lk->flags);
-		return 0;
-	}
-
-	if (lk->tsk->tgid == current->tgid)
-		lk->tsk = current;
-
-	return 1;
-}
-
-static void signalfd_unlock(struct signalfd_lockctx *lk)
-{
-	unlock_task_sighand(lk->tsk, &lk->flags);
-}
-
-/*
- * This must be called with the sighand lock held.
- */
-void signalfd_deliver(struct task_struct *tsk, int sig)
-{
-	struct sighand_struct *sighand = tsk->sighand;
-	struct signalfd_ctx *ctx, *tmp;
-
-	BUG_ON(!sig);
-	list_for_each_entry_safe(ctx, tmp, &sighand->signalfd_list, lnk) {
-		/*
-		 * We use a negative signal value as a way to broadcast that the
-		 * sighand has been orphaned, so that we can notify all the
-		 * listeners about this. Remember the ctx->sigmask is inverted,
-		 * so if the user is interested in a signal, that corresponding
-		 * bit will be zero.
-		 */
-		if (sig < 0) {
-			if (ctx->tsk == tsk) {
-				ctx->tsk = NULL;
-				list_del_init(&ctx->lnk);
-				wake_up(&ctx->wqh);
-			}
-		} else {
-			if (!sigismember(&ctx->sigmask, sig))
-				wake_up(&ctx->wqh);
-		}
-	}
-}
-
-static void signalfd_cleanup(struct signalfd_ctx *ctx)
-{
-	struct signalfd_lockctx lk;
-
-	/*
-	 * This is tricky. If the sighand is gone, we do not need to remove
-	 * context from the list, the list itself won't be there anymore.
-	 */
-	if (signalfd_lock(ctx, &lk)) {
-		list_del(&ctx->lnk);
-		signalfd_unlock(&lk);
-	}
 	kfree(ctx);
-}
-
 static int signalfd_release(struct inode *inode, struct file *file)
 {
-	signalfd_cleanup(file->private_data);
+	kfree(file->private_data);
 	return 0;
 }

@@ -130,23 +42,15 @@ static unsigned int signalfd_poll(struct file *file, poll_table *wait)
 {
 	struct signalfd_ctx *ctx = file->private_data;
 	unsigned int events = 0;
-	struct signalfd_lockctx lk;

-	poll_wait(file, &ctx->wqh, wait);
+	poll_wait(file, &current->sighand->signalfd_wqh, wait);

-	/*
-	 * Let the caller get a POLLIN in this case, ala socket recv() when
-	 * the peer disconnects.
-	 */
-	if (signalfd_lock(ctx, &lk)) {
-		if ((lk.tsk == current &&
-		     next_signal(&lk.tsk->pending, &ctx->sigmask) > 0) ||
-		    next_signal(&lk.tsk->signal->shared_pending,
-				&ctx->sigmask) > 0)
-			events |= POLLIN;
-		signalfd_unlock(&lk);
-	} else
+	spin_lock_irq(&current->sighand->siglock);
+	if (next_signal(&current->pending, &ctx->sigmask) ||
+	    next_signal(&current->signal->shared_pending,
+			&ctx->sigmask))
 		events |= POLLIN;
+	spin_unlock_irq(&current->sighand->siglock);

 	return events;
 }
@@ -219,59 +123,46 @@ static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
				int nonblock)
 {
 	ssize_t ret;
-	struct signalfd_lockctx lk;
 	DECLARE_WAITQUEUE(wait, current);

-	if (!signalfd_lock(ctx, &lk))
-		return 0;
-
-	ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
+	spin_lock_irq(&current->sighand->siglock);
+	ret = dequeue_signal(current, &ctx->sigmask, info);
 	switch (ret) {
 	case 0:
 		if (!nonblock)
 			break;
 		ret = -EAGAIN;
 	default:
-		signalfd_unlock(&lk);
+		spin_unlock_irq(&current->sighand->siglock);
 		return ret;
 	}

-	add_wait_queue(&ctx->wqh, &wait);
+	add_wait_queue(&current->sighand->signalfd_wqh, &wait);
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		ret = dequeue_signal(lk.tsk, &ctx->sigmask, info);
-		signalfd_unlock(&lk);
+		ret = dequeue_signal(current, &ctx->sigmask, info);
 		if (ret != 0)
 			break;
 		if (signal_pending(current)) {
 			ret = -ERESTARTSYS;
 			break;
 		}
+		spin_unlock_irq(&current->sighand->siglock);
 		schedule();
-		ret = signalfd_lock(ctx, &lk);
-		if (unlikely(!ret)) {
-			/*
-			 * Let the caller read zero byte, ala socket
-			 * recv() when the peer disconnect. This test
-			 * must be done before doing a dequeue_signal(),
-			 * because if the sighand has been orphaned,
-			 * the dequeue_signal() call is going to crash
-			 * because ->sighand will be long gone.
-			 */
-			break;
-		}
+		spin_lock_irq(&current->sighand->siglock);
 	}
+	spin_unlock_irq(&current->sighand->siglock);

-	remove_wait_queue(&ctx->wqh, &wait);
+	remove_wait_queue(&current->sighand->signalfd_wqh, &wait);
 	__set_current_state(TASK_RUNNING);

 	return ret;
 }

 /*
- * Returns either the size of a "struct signalfd_siginfo", or zero if the
- * sighand we are attached to, has been orphaned. The "count" parameter
- * must be at least the size of a "struct signalfd_siginfo".
+ * Returns a multiple of the size of a "struct signalfd_siginfo", or a negative
+ * error code. The "count" parameter must be at least the size of a
+ * "struct signalfd_siginfo".
 */
 static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
			     loff_t *ppos)
@@ -287,7 +178,6 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 		return -EINVAL;

 	siginfo = (struct signalfd_siginfo __user *) buf;
-
 	do {
 		ret = signalfd_dequeue(ctx, &info, nonblock);
 		if (unlikely(ret <= 0))
			break;
@@ -300,7 +190,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
 		nonblock = 1;
 	} while (--count);

-	return total ? total : ret;
+	return total ? total: ret;
 }

 static const struct file_operations signalfd_fops = {
@@ -309,20 +199,13 @@ static const struct file_operations signalfd_fops = {
 	.read		= signalfd_read,
 };

-/*
- * Create a file descriptor that is associated with our signal
- * state. We can pass it around to others if we want to, but
- * it will always be _our_ signal state.
- */
 asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask)
 {
 	int error;
 	sigset_t sigmask;
 	struct signalfd_ctx *ctx;
-	struct sighand_struct *sighand;
 	struct file *file;
 	struct inode *inode;
-	struct signalfd_lockctx lk;

 	if (sizemask != sizeof(sigset_t) ||
 	    copy_from_user(&sigmask, user_mask, sizeof(sigmask)))
@@ -335,17 +218,7 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 		if (!ctx)
 			return -ENOMEM;

-		init_waitqueue_head(&ctx->wqh);
 		ctx->sigmask = sigmask;
-		ctx->tsk = current->group_leader;
-
-		sighand = current->sighand;
-		/*
-		 * Add this fd to the list of signal listeners.
-		 */
-		spin_lock_irq(&sighand->siglock);
-		list_add_tail(&ctx->lnk, &sighand->signalfd_list);
-		spin_unlock_irq(&sighand->siglock);

 		/*
 		 * When we call this, the initialization must be complete, since
@@ -364,23 +237,18 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas
 			fput(file);
 			return -EINVAL;
 		}
-		/*
-		 * We need to be prepared of the fact that the sighand this fd
-		 * is attached to, has been detched. In that case signalfd_lock()
-		 * will return 0, and we'll just skip setting the new mask.
-		 */
-		if (signalfd_lock(ctx, &lk)) {
-			ctx->sigmask = sigmask;
-			signalfd_unlock(&lk);
-		}
-		wake_up(&ctx->wqh);
+		spin_lock_irq(&current->sighand->siglock);
+		ctx->sigmask = sigmask;
+		spin_unlock_irq(&current->sighand->siglock);
+
+		wake_up(&current->sighand->signalfd_wqh);
 		fput(file);
 	}

 	return ufd;

 err_fdalloc:
-	signalfd_cleanup(ctx);
+	kfree(ctx);
 	return error;
 }

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index cab741c2d603..f8abfa349ef9 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -86,7 +86,7 @@ extern struct nsproxy init_nsproxy;
 	.count		= ATOMIC_INIT(1),				\
 	.action		= { { { .sa_handler = NULL, } }, },		\
 	.siglock	= __SPIN_LOCK_UNLOCKED(sighand.siglock),	\
-	.signalfd_list	= LIST_HEAD_INIT(sighand.signalfd_list),	\
+	.signalfd_wqh	= __WAIT_QUEUE_HEAD_INITIALIZER(sighand.signalfd_wqh),	\
 }

 extern struct group_info init_groups;

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3de79016f2a6..a01ac6dd5f5e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -438,7 +438,7 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
-	struct list_head	signalfd_list;
+	wait_queue_head_t	signalfd_wqh;
 };

 struct pacct_struct {

diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h
index 510429495690..4c9ff0910ae0 100644
--- a/include/linux/signalfd.h
+++ b/include/linux/signalfd.h
@@ -45,49 +45,17 @@ struct signalfd_siginfo {
 #ifdef CONFIG_SIGNALFD

 /*
- * Deliver the signal to listening signalfd. This must be called
- * with the sighand lock held. Same are the following that end up
- * calling signalfd_deliver().
- */
-void signalfd_deliver(struct task_struct *tsk, int sig);
-
-/*
- * No need to fall inside signalfd_deliver() if no signal listeners
- * are available.
+ * Deliver the signal to listening signalfd.
 */
 static inline void signalfd_notify(struct task_struct *tsk, int sig)
 {
-	if (unlikely(!list_empty(&tsk->sighand->signalfd_list)))
-		signalfd_deliver(tsk, sig);
-}
-
-/*
- * The signal -1 is used to notify the signalfd that the sighand
- * is on its way to be detached.
- */
-static inline void signalfd_detach_locked(struct task_struct *tsk)
-{
-	if (unlikely(!list_empty(&tsk->sighand->signalfd_list)))
-		signalfd_deliver(tsk, -1);
-}
-
-static inline void signalfd_detach(struct task_struct *tsk)
-{
-	struct sighand_struct *sighand = tsk->sighand;
-
-	if (unlikely(!list_empty(&sighand->signalfd_list))) {
-		spin_lock_irq(&sighand->siglock);
-		signalfd_deliver(tsk, -1);
-		spin_unlock_irq(&sighand->siglock);
-	}
+	if (unlikely(waitqueue_active(&tsk->sighand->signalfd_wqh)))
+		wake_up(&tsk->sighand->signalfd_wqh);
 }

 #else /* CONFIG_SIGNALFD */

-#define signalfd_deliver(t, s) do { } while (0)
-#define signalfd_notify(t, s) do { } while (0)
-#define signalfd_detach_locked(t) do { } while (0)
-#define signalfd_detach(t) do { } while (0)
+static inline void signalfd_notify(struct task_struct *tsk, int sig) { }

 #endif /* CONFIG_SIGNALFD */

diff --git a/kernel/exit.c b/kernel/exit.c
index 06b24b3aa370..993369ee94d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -24,7 +24,6 @@
 #include
 #include
 #include
-#include <linux/signalfd.h>
 #include
 #include
 #include
@@ -86,14 +85,6 @@ static void __exit_signal(struct task_struct *tsk)
 	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);

-	/*
-	 * Notify that this sighand has been detached. This must
-	 * be called with the tsk->sighand lock held. Also, this
-	 * access tsk->sighand internally, so it must be called
-	 * before tsk->sighand is reset.
-	 */
-	signalfd_detach_locked(tsk);
-
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count))
 		posix_cpu_timers_exit_group(tsk);

diff --git a/kernel/fork.c b/kernel/fork.c
index 7332e236d367..33f12f48684a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1438,7 +1438,7 @@ static void sighand_ctor(void *data, struct kmem_cache *cachep,
 	struct sighand_struct *sighand = data;

 	spin_lock_init(&sighand->siglock);
-	INIT_LIST_HEAD(&sighand->signalfd_list);
+	init_waitqueue_head(&sighand->signalfd_wqh);
 }

 void __init proc_caches_init(void)

diff --git a/kernel/signal.c b/kernel/signal.c
index 3169bed0b4d0..9fb91a32edda 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -378,8 +377,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 	/* We only dequeue private signals from ourselves, we don't let
 	 * signalfd steal them
 	 */
-	if (likely(tsk == current))
-		signr = __dequeue_signal(&tsk->pending, mask, info);
+	signr = __dequeue_signal(&tsk->pending, mask, info);
 	if (!signr) {
 		signr = __dequeue_signal(&tsk->signal->shared_pending,
					 mask, info);
@@ -407,8 +406,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 			}
 		}
 	}
-	if (likely(tsk == current))
-		recalc_sigpending();
+	recalc_sigpending();
 	if (signr && unlikely(sig_kernel_stop(signr))) {
 		/*
 		 * Set a marker that we have dequeued a stop signal. Our
@@ -425,7 +423,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
 		if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT))
 			tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
 	}
-	if (signr && likely(tsk == current) &&
+	if (signr &&
 	     ((info->si_code & __SI_MASK) == __SI_TIMER) &&
 	     info->si_sys_private){
 		/*
-- cgit
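The behavioural point of the change, that a signalfd now always reflects the signal state of the thread reading it, is easiest to see from userspace. Below is a sketch of basic signalfd usage; it calls syscall(2) directly, since glibc of that era had no signalfd() wrapper, and assumes <linux/signalfd.h> supplies struct signalfd_siginfo and <sys/syscall.h> supplies __NR_signalfd.

    #include <signal.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/signalfd.h>

    int main(void)
    {
        sigset_t mask;
        struct signalfd_siginfo fdsi;
        int sfd;

        sigemptyset(&mask);
        sigaddset(&mask, SIGINT);
        sigprocmask(SIG_BLOCK, &mask, NULL);    /* keep default delivery off */

        sfd = syscall(__NR_signalfd, -1, &mask, sizeof(mask));
        if (sfd < 0) {
            perror("signalfd");
            return 1;
        }
        /* read() dequeues the calling thread's private and shared signals;
         * after this patch it can no longer steal another thread's. */
        if (read(sfd, &fdsi, sizeof(fdsi)) == sizeof(fdsi))
            printf("got signal %u\n", fdsi.ssi_signo);
        return 0;
    }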
From ff0ce6845bc18292e80ea40d11c3d3a539a3fc5e Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Wed, 26 Sep 2007 15:52:17 -0700
Subject: Revert "[PATCH] x86-64: fix x86_64-mm-sched-clock-share"

This reverts commit 184c44d2049c4db7ef6ec65794546954da2c6a0e.

As noted by Dave Jones:
  "Linus, please revert the above cset. It doesn't seem to be necessary
   (it was added to fix a miscompile in 'make allnoconfig' which doesn't
   seem to be repeatable with it reverted) and actively breaks the ARM
   SA1100 framebuffer driver."

Requested-by: Dave Jones
Cc: Russell King
Cc: Andrew Morton
Cc: Andi Kleen
Signed-off-by: Linus Torvalds
---
 include/linux/cpufreq.h | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 963051a967d6..3ec6e7ff5fbd 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -32,15 +32,7 @@
 *                     CPUFREQ NOTIFIER INTERFACE                    *
 *********************************************************************/

-#ifdef CONFIG_CPU_FREQ
 int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list);
-#else
-static inline int cpufreq_register_notifier(struct notifier_block *nb,
-						unsigned int list)
-{
-	return 0;
-}
-#endif
 int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list);

 #define CPUFREQ_TRANSITION_NOTIFIER	(0)
@@ -268,22 +260,17 @@ struct freq_attr {
 int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu);
 int cpufreq_update_policy(unsigned int cpu);

+/* query the current CPU frequency (in kHz). If zero, cpufreq couldn't detect it */
+unsigned int cpufreq_get(unsigned int cpu);

-/*
- * query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it
- */
+/* query the last known CPU freq (in kHz). If zero, cpufreq couldn't detect it */
 #ifdef CONFIG_CPU_FREQ
 unsigned int cpufreq_quick_get(unsigned int cpu);
-unsigned int cpufreq_get(unsigned int cpu);
 #else
 static inline unsigned int cpufreq_quick_get(unsigned int cpu)
 {
	return 0;
 }
-static inline unsigned int cpufreq_get(unsigned int cpu)
-{
-	return 0;
-}
 #endif
-- cgit
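With the revert, cpufreq_register_notifier() is once again declared unconditionally, so a driver such as the SA1100 framebuffer can register a transition notifier in the usual way. A minimal sketch of that pattern follows; the demo_* names and the printk are illustrative, not taken from any driver in the patch.

    #include <linux/kernel.h>
    #include <linux/init.h>
    #include <linux/notifier.h>
    #include <linux/cpufreq.h>

    /* Called around every frequency change with CPUFREQ_PRECHANGE /
     * CPUFREQ_POSTCHANGE; data points to a struct cpufreq_freqs. */
    static int demo_freq_transition(struct notifier_block *nb,
                                    unsigned long val, void *data)
    {
        struct cpufreq_freqs *freqs = data;

        if (val == CPUFREQ_POSTCHANGE)
            printk(KERN_DEBUG "cpu%u: %u kHz -> %u kHz\n",
                   freqs->cpu, freqs->old, freqs->new);
        return 0;
    }

    static struct notifier_block demo_freq_nb = {
        .notifier_call = demo_freq_transition,
    };

    static int __init demo_init(void)
    {
        return cpufreq_register_notifier(&demo_freq_nb,
                                         CPUFREQ_TRANSITION_NOTIFIER);
    }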
From 0c2043abefacac97b6d01129c1123a466c95b7c1 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sun, 7 Oct 2007 16:17:38 -0700
Subject: Don't do load-average calculations at even 5-second intervals
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

It turns out that there are a few other five-second timers in the
kernel, and if the timers get in sync, the load-average can get
artificially inflated by events that just happen to coincide. So just
offset the load average calculation by a timer tick.

Noticed by Anders Boström, for whom the coincidence started triggering
on one of his machines with the JBD jiffies rounding code (JBD is one
of the subsystems that also end up using a 5-second timer by default).

Tested-by: Anders Boström
Cc: Chuck Ebbert
Cc: Arjan van de Ven
Cc: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/sched.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
(limited to 'include/linux')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index a01ac6dd5f5e..313c6b6e774f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,7 +113,7 @@ extern unsigned long avenrun[];	/* Load averages */
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
-#define LOAD_FREQ	(5*HZ)		/* 5 sec intervals */
+#define LOAD_FREQ	(5*HZ+1)	/* 5 sec intervals */
 #define EXP_1		1884		/* 1/exp(5sec/1min) as fixed-point */
 #define EXP_5		2014		/* 1/exp(5sec/5min) */
 #define EXP_15		2037		/* 1/exp(5sec/15min) */
-- cgit
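The FSHIFT/FIXED_1/EXP_* constants implement the classic fixed-point exponential moving average behind /proc/loadavg. The standalone program below mirrors the kernel's CALC_LOAD macro so the arithmetic can be observed directly; the surrounding demo loop and the pretend task count are illustrative only.

    #include <stdio.h>

    #define FSHIFT  11                      /* bits of fixed-point precision */
    #define FIXED_1 (1 << FSHIFT)           /* 1.0 in fixed point */
    #define EXP_1   1884                    /* 1/exp(5sec/1min) in fixed point */

    /* One step: load = load*exp + active*(1 - exp), all in fixed point */
    #define CALC_LOAD(load, exp, n) \
        load *= exp; \
        load += (n) * (FIXED_1 - (exp)); \
        load >>= FSHIFT;

    int main(void)
    {
        unsigned long avenrun = 0;
        unsigned long active = 2 * FIXED_1; /* pretend 2 runnable tasks */
        int i;

        for (i = 0; i < 24; i++) {          /* 24 samples = ~2 minutes */
            CALC_LOAD(avenrun, EXP_1, active);
            printf("%lu.%02lu\n", avenrun >> FSHIFT,
                   (avenrun & (FIXED_1 - 1)) * 100 / FIXED_1);
        }
        return 0;
    }

The LOAD_FREQ change above only shifts when this update runs (every 5*HZ+1 ticks instead of 5*HZ), so the sampling no longer beats against other five-second timers.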
From: Peter Zijlstra
Date: Mon, 8 Oct 2007 18:54:37 +0200
Subject: mm: set_page_dirty_balance() vs ->page_mkwrite()

All the current page_mkwrite() implementations also set the page dirty,
which results in the set_page_dirty_balance() call _not_ balancing,
because the page is already found dirty. This allows us to dirty a
_lot_ of pages without ever hitting balance_dirty_pages(). Not good (tm).

Force a balance call if ->page_mkwrite() was successful.

Signed-off-by: Peter Zijlstra
Signed-off-by: Linus Torvalds
---
 include/linux/writeback.h | 2 +-
 mm/memory.c               | 9 +++++++--
 mm/page-writeback.c       | 4 ++--
 3 files changed, 10 insertions(+), 5 deletions(-)
(limited to 'include/linux')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 4ef4d22e5e43..b4af6bcb7b7a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -127,7 +127,7 @@ int sync_page_range(struct inode *inode, struct address_space *mapping,
			loff_t pos, loff_t count);
 int sync_page_range_nolock(struct inode *inode, struct address_space *mapping,
			loff_t pos, loff_t count);
-void set_page_dirty_balance(struct page *page);
+void set_page_dirty_balance(struct page *page, int page_mkwrite);
 void writeback_set_ratelimit(void);

 /* pdflush.c */

diff --git a/mm/memory.c b/mm/memory.c
index c0e7741a98de..f82b359b2745 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1639,6 +1639,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *old_page, *new_page;
 	pte_t entry;
 	int reuse = 0, ret = 0;
+	int page_mkwrite = 0;
 	struct page *dirty_page = NULL;

 	old_page = vm_normal_page(vma, address, orig_pte);
@@ -1687,6 +1688,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			page_cache_release(old_page);
 			if (!pte_same(*page_table, orig_pte))
 				goto unlock;
+
+			page_mkwrite = 1;
 		}
 		dirty_page = old_page;
 		get_page(dirty_page);
@@ -1774,7 +1777,7 @@ unlock:
 		 * do_no_page is protected similarly.
 		 */
 		wait_on_page_locked(dirty_page);
-		set_page_dirty_balance(dirty_page);
+		set_page_dirty_balance(dirty_page, page_mkwrite);
 		put_page(dirty_page);
 	}
 	return ret;
@@ -2322,6 +2325,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	struct page *dirty_page = NULL;
 	struct vm_fault vmf;
 	int ret;
+	int page_mkwrite = 0;

 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
@@ -2398,6 +2402,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
				anon = 1; /* no anon but release vmf.page */
				goto out;
			}
+			page_mkwrite = 1;
 		}
 	}

@@ -2453,7 +2458,7 @@ out_unlocked:
 	if (anon)
		page_cache_release(vmf.page);
 	else if (dirty_page) {
-		set_page_dirty_balance(dirty_page);
+		set_page_dirty_balance(dirty_page, page_mkwrite);
 		put_page(dirty_page);
 	}

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 63512a9ed57e..44720363374c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -274,9 +274,9 @@ static void balance_dirty_pages(struct address_space *mapping)
		pdflush_operation(background_writeout, 0);
 }

-void set_page_dirty_balance(struct page *page)
+void set_page_dirty_balance(struct page *page, int page_mkwrite)
 {
-	if (set_page_dirty(page)) {
+	if (set_page_dirty(page) || page_mkwrite) {
		struct address_space *mapping = page_mapping(page);

		if (mapping)
-- cgit
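For context, the hook being balanced here is a filesystem or driver ->page_mkwrite() implementation, and the patch's premise is that such hooks dirty the page themselves. A minimal sketch of a hook with exactly that behaviour is below; the demo_* names are hypothetical, and the shape assumes the 2.6.23-era signature, where the hook takes the page directly rather than a struct vm_fault.

    #include <linux/mm.h>
    #include <linux/pagemap.h>

    /* Make a read-only mapped page writable: validate it is still in the
     * mapping, then dirty it -- the "also sets the page dirty" behaviour
     * the commit message describes, which is why the fault path must now
     * force a balance call itself. */
    static int demo_page_mkwrite(struct vm_area_struct *vma, struct page *page)
    {
        lock_page(page);
        if (page->mapping != vma->vm_file->f_mapping) {
            unlock_page(page);
            return -EINVAL;     /* page was truncated under us */
        }
        set_page_dirty(page);
        unlock_page(page);
        return 0;
    }

    static struct vm_operations_struct demo_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = demo_page_mkwrite,
    };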